From 69fa0ad12eea20323c0e0e1ff1f319eb7cdd1974 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 23 Dec 2014 00:09:54 -0800 Subject: [PATCH 001/175] topic arn rename and travis integration --- .travis.yml | 8 ++++++++ dataduct/config/example_config | 3 ++- dataduct/etl_pipeline.py | 6 ++++-- dataduct/pipeline/sns_alarm.py | 8 ++++++-- docs/installation.rst | 3 ++- 5 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..3afa070 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,8 @@ +language: python + +# command to install dependencies +install: + - pip install -r requirements.txt + +# command to run tests +script: nosetests diff --git a/dataduct/config/example_config b/dataduct/config/example_config index d56a811..f030b42 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -36,7 +36,8 @@ etl: RETRY_DELAY: 10 Minutes DEFAULT_MAX_RETRIES: 0 ETL_BUCKET: FILL_ME_IN - DATA_PIPELINE_TOPIC_ARN: FILL_ME_IN + SNS_TOPIC_ARN_FAILURE: FILL_ME_IN + SNS_TOPIC_ARN_WARNING: FILL_ME_IN DAILY_LOAD_TIME: 1 # run at 1AM UTC bootstrap: diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index 0b7e930..e002bac 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -53,7 +53,7 @@ class ETLPipeline(object): def __init__(self, name, frequency='one-time', ec2_resource_terminate_after='6 Hours', delay=None, emr_cluster_config=None, load_time=None, - max_retries=DEFAULT_MAX_RETRIES): + topic_arn=None, max_retries=DEFAULT_MAX_RETRIES): """Example of docstring on the __init__ method. The __init__ method may be documented in either the class level @@ -85,6 +85,7 @@ def __init__(self, name, frequency='one-time', self.load_hour = load_hour self.load_min = load_min self.max_retries = max_retries + self.topic_arn = topic_arn if emr_cluster_config: self.emr_cluster_config = emr_cluster_config @@ -162,7 +163,8 @@ def create_base_objects(self): # self.sns = None -> Used for testing self.sns = self.create_pipeline_object( object_class=SNSAlarm, - pipeline_name=self.name + topic_arn=self.topic_arn, + pipeline_name=self.name, ) self.default = self.create_pipeline_object( object_class=DefaultObject, diff --git a/dataduct/pipeline/sns_alarm.py b/dataduct/pipeline/sns_alarm.py index e0b1cfa..19b9639 100644 --- a/dataduct/pipeline/sns_alarm.py +++ b/dataduct/pipeline/sns_alarm.py @@ -6,7 +6,7 @@ from .pipeline_object import PipelineObject config = Config() -DATA_PIPELINE_TOPIC_ARN = config.etl['DATA_PIPELINE_TOPIC_ARN'] +SNS_TOPIC_ARN_FAILURE = config.etl['SNS_TOPIC_ARN_FAILURE'] DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] @@ -18,6 +18,7 @@ def __init__(self, id, pipeline_name=None, failure_message=None, + topic_arn=None, **kwargs): """Constructor for the SNSAlarm class @@ -40,10 +41,13 @@ def __init__(self, 'Error Stack Trace: #{node.errorStackTrace}' ]) + if topic_arn is None: + topic_arn = SNS_TOPIC_ARN_FAILURE + super(SNSAlarm, self).__init__( id=id, type='SnsAlarm', - topicArn=DATA_PIPELINE_TOPIC_ARN, + topicArn=topic_arn, role=DEFAULT_ROLE, subject='Data Pipeline Failure', message=failure_message, diff --git a/docs/installation.rst b/docs/installation.rst index 7fdb39c..5640c25 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -67,7 +67,8 @@ environment variable pointing to the config file location by setting the RETRY_DELAY: 10 Minutes DEFAULT_MAX_RETRIES: 0 ETL_BUCKET: FILL_ME_IN - DATA_PIPELINE_TOPIC_ARN: FILL_ME_IN + SNS_TOPIC_ARN_FAILURE: FILL_ME_IN + 
SNS_TOPIC_ARN_WARNING: FILL_ME_IN DAILY_LOAD_TIME: 1 # run at 1AM UTC bootstrap: From c5d93af4780e36ec8ea780fefb73901eb4f48e5e Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 23 Dec 2014 10:14:00 -0800 Subject: [PATCH 002/175] QA step --- dataduct/etl_pipeline.py | 6 ++++- dataduct/steps/qa_transform.py | 42 +++++++++++++++++++++++++++++++++ examples/scripts/s3_profiler.py | 6 +++++ 3 files changed, 53 insertions(+), 1 deletion(-) create mode 100644 dataduct/steps/qa_transform.py diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index e002bac..d777659 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -24,6 +24,7 @@ from .steps.load_redshift import LoadRedshiftStep from .steps.sql_command import SqlCommandStep from .steps.transform import TransformStep +from .steps.qa_transform import QATransformStep from .s3.s3_file import S3File from .s3.s3_path import S3Path @@ -340,9 +341,12 @@ def determine_step_class(self, step_type, step_args): """ if step_type == 'transform': step_class = TransformStep - if step_args.get('resource', None) == 'emr-cluster': + if step_args.get('resource', None) == EMR_CLUSTER_STR: step_args['resource'] = self.emr_cluster + elif step_type == 'qa-transform': + step_class = QATransformStep + elif step_type == 'extract-s3': step_class = ExtractS3Step step_args.pop('resource') diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py new file mode 100644 index 0000000..856a579 --- /dev/null +++ b/dataduct/steps/qa_transform.py @@ -0,0 +1,42 @@ +""" +ETL step wrapper for QA step can be executed on Ec2 / EMR +""" +from .transform import TransformStep +from ..config import Config + +config = Config() +SNS_TOPIC_ARN_WARNING = config.etl['SNS_TOPIC_ARN_WARNING'] + + +class QATransformStep(TransformStep): + """QATransform Step class that helps run scripts on resouces for QA checks + """ + + def __init__(self, + id, + script_arguments, + pipeline_name, + sns_topic_arn=SNS_TOPIC_ARN_WARNING, + **kwargs): + """Constructor for the QATransformStep class + + Args: + sns_arn(str): sns topic arn for QA steps + script_arguments(list of str): list of arguments to the script + **kwargs(optional): Keyword arguments directly passed to base class + """ + + if script_arguments is None: + script_arguments = list() + + script_arguments.extend( + [ + "--sns_topic_arn=%s" % sns_topic_arn, + "--test_name=%s" % (pipeline_name + "." + id) + ] + ) + + super(QATransformStep, self).__init__( + id=id, + script_arguments=script_arguments, + **kwargs) diff --git a/examples/scripts/s3_profiler.py b/examples/scripts/s3_profiler.py index a4f8bad..0835d52 100755 --- a/examples/scripts/s3_profiler.py +++ b/examples/scripts/s3_profiler.py @@ -14,8 +14,10 @@ def run_command(command): """Execute a shell command + Args: command(list of str): list of command arguments + Returns: output(str): stdout of command """ @@ -24,8 +26,10 @@ def run_command(command): def recurse_directory(directory_path): """Recursively walk directories and output basic stats on files + Args: directory_path(str): Path to the directory which is read + Returns: result(list of tuples): (filename, count of lines in file, size of file) """ @@ -43,6 +47,7 @@ def recurse_directory(directory_path): def paths_exist(input_directory, paths): """Check if a path exists or not + Args: input_directory(str): input directory to be checked paths(list of str): paths for which one should check the existence. 
@@ -58,6 +63,7 @@ def profile_input(input_directory, output_directory, output_file_name, fail_if_empty): """Lists statistics for all files located in input directrory. Output is written to a file in the output directory. + Args: input_directory(path): path to the input directory output_directory(path): path to the output directory From ea33a46701855270d92b5a9cb3d200405328a4d0 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 23 Dec 2014 11:31:48 -0800 Subject: [PATCH 003/175] import verbosity fix --- dataduct/config/__init__.py | 2 +- dataduct/etl_pipeline.py | 34 ++++++++++++++-------------- dataduct/pipeline/__init__.py | 18 +++++++++++++++ dataduct/pipeline/ec2_resource.py | 2 +- dataduct/pipeline/emr_resource.py | 2 +- dataduct/pipeline/pipeline_object.py | 6 ++--- dataduct/pipeline/s3_node.py | 6 ++--- dataduct/pipeline/sql_activity.py | 2 +- dataduct/s3/__init__.py | 4 ++++ dataduct/steps/__init__.py | 9 ++++++++ dataduct/steps/emr_streaming.py | 6 ++--- dataduct/steps/etl_step.py | 10 ++++---- dataduct/steps/extract_local.py | 2 +- dataduct/steps/extract_rds.py | 8 +++---- dataduct/steps/extract_redshift.py | 4 ++-- dataduct/steps/extract_s3.py | 2 +- dataduct/steps/load_redshift.py | 4 ++-- dataduct/steps/sql_command.py | 4 ++-- dataduct/steps/transform.py | 4 ++-- 19 files changed, 80 insertions(+), 49 deletions(-) diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index 34e05a5..cca5d9b 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1 +1 @@ -from config import Config +from .config import Config diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index d777659..836894d 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -6,25 +6,25 @@ from .config import Config -from .pipeline.default_object import DefaultObject -from .pipeline.data_pipeline import DataPipeline -from .pipeline.ec2_resource import Ec2Resource -from .pipeline.emr_resource import EmrResource -from .pipeline.redshift_database import RedshiftDatabase -from .pipeline.s3_node import S3Node -from .pipeline.schedule import Schedule -from .pipeline.sns_alarm import SNSAlarm +from .pipeline import DefaultObject +from .pipeline import DataPipeline +from .pipeline import Ec2Resource +from .pipeline import EmrResource +from .pipeline import RedshiftDatabase +from .pipeline import S3Node +from .pipeline import Schedule +from .pipeline import SNSAlarm from .pipeline.utils import list_pipelines -from .steps.emr_streaming import EMRStreamingStep -from .steps.extract_local import ExtractLocalStep -from .steps.extract_rds import ExtractRdsStep -from .steps.extract_redshift import ExtractRedshiftStep -from .steps.extract_s3 import ExtractS3Step -from .steps.load_redshift import LoadRedshiftStep -from .steps.sql_command import SqlCommandStep -from .steps.transform import TransformStep -from .steps.qa_transform import QATransformStep +from .steps import EMRStreamingStep +from .steps import ExtractLocalStep +from .steps import ExtractRdsStep +from .steps import ExtractRedshiftStep +from .steps import ExtractS3Step +from .steps import LoadRedshiftStep +from .steps import SqlCommandStep +from .steps import TransformStep +from .steps import QATransformStep from .s3.s3_file import S3File from .s3.s3_path import S3Path diff --git a/dataduct/pipeline/__init__.py b/dataduct/pipeline/__init__.py index e69de29..630a29c 100644 --- a/dataduct/pipeline/__init__.py +++ b/dataduct/pipeline/__init__.py @@ -0,0 +1,18 @@ +from .activity import Activity +from 
.copy_activity import CopyActivity +from .data_pipeline import DataPipeline +from .default_object import DefaultObject +from .ec2_resource import Ec2Resource +from .emr_resource import EmrResource +from .emr_activity import EmrActivity +from .mysql_node import MysqlNode +from .pipeline_object import PipelineObject +from .precondition import Precondition +from .redshift_copy_activity import RedshiftCopyActivity +from .redshift_node import RedshiftNode +from .redshift_database import RedshiftDatabase +from .s3_node import S3Node +from .schedule import Schedule +from .shell_command_activity import ShellCommandActivity +from .sns_alarm import SNSAlarm +from .sql_activity import SqlActivity diff --git a/dataduct/pipeline/ec2_resource.py b/dataduct/pipeline/ec2_resource.py index 0231002..30f482d 100644 --- a/dataduct/pipeline/ec2_resource.py +++ b/dataduct/pipeline/ec2_resource.py @@ -4,7 +4,7 @@ from ..config import Config from .pipeline_object import PipelineObject -from ..s3.s3_log_path import S3LogPath +from ..s3 import S3LogPath from .schedule import Schedule from ..utils.exceptions import ETLInputError diff --git a/dataduct/pipeline/emr_resource.py b/dataduct/pipeline/emr_resource.py index 9dc1b9d..c7e719f 100644 --- a/dataduct/pipeline/emr_resource.py +++ b/dataduct/pipeline/emr_resource.py @@ -4,7 +4,7 @@ from ..config import Config from .pipeline_object import PipelineObject -from ..s3.s3_log_path import S3LogPath +from ..s3 import S3LogPath from .schedule import Schedule from ..utils.exceptions import ETLInputError diff --git a/dataduct/pipeline/pipeline_object.py b/dataduct/pipeline/pipeline_object.py index ed09a8e..b84cd32 100644 --- a/dataduct/pipeline/pipeline_object.py +++ b/dataduct/pipeline/pipeline_object.py @@ -3,9 +3,9 @@ """ from collections import defaultdict -from ..s3.s3_path import S3Path -from ..s3.s3_file import S3File -from ..s3.s3_directory import S3Directory +from ..s3 import S3Path +from ..s3 import S3File +from ..s3 import S3Directory from ..utils.exceptions import ETLInputError diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index f9c8413..7afe2bd 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -7,9 +7,9 @@ from .precondition import Precondition from .schedule import Schedule -from ..s3.s3_path import S3Path -from ..s3.s3_file import S3File -from ..s3.s3_directory import S3Directory +from ..s3 import S3Path +from ..s3 import S3File +from ..s3 import S3Directory from ..utils.exceptions import ETLInputError config = Config() diff --git a/dataduct/pipeline/sql_activity.py b/dataduct/pipeline/sql_activity.py index 7d6d315..46a5cc6 100644 --- a/dataduct/pipeline/sql_activity.py +++ b/dataduct/pipeline/sql_activity.py @@ -5,7 +5,7 @@ from .activity import Activity from ..config import Config from .schedule import Schedule -from ..s3.s3_file import S3File +from ..s3 import S3File from ..utils.exceptions import ETLInputError diff --git a/dataduct/s3/__init__.py b/dataduct/s3/__init__.py index e69de29..0b2e46e 100644 --- a/dataduct/s3/__init__.py +++ b/dataduct/s3/__init__.py @@ -0,0 +1,4 @@ +from .s3_file import S3File +from .s3_path import S3Path +from .s3_directory import S3Directory +from .s3_log_path import S3LogPath diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index e69de29..27fe8a5 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -0,0 +1,9 @@ +from emr_streaming import EMRStreamingStep +from extract_local import ExtractLocalStep +from extract_rds import 
ExtractRdsStep +from extract_redshift import ExtractRedshiftStep +from extract_s3 import ExtractS3Step +from load_redshift import LoadRedshiftStep +from sql_command import SqlCommandStep +from transform import TransformStep +from qa_transform import QATransformStep diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 04e821b..466d16c 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -2,9 +2,9 @@ ETL step wrapper for EmrActivity can be executed on Ec2 """ from .etl_step import ETLStep -from ..pipeline.emr_activity import EmrActivity -from ..s3.s3_file import S3File -from ..s3.s3_path import S3Path +from ..pipeline import EmrActivity +from ..s3 import S3File +from ..s3 import S3Path from ..utils.exceptions import ETLInputError diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 09f6092..8f3be75 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -3,11 +3,11 @@ """ from ..config import Config -from ..pipeline.activity import Activity -from ..pipeline.copy_activity import CopyActivity -from ..pipeline.s3_node import S3Node -from ..s3.s3_path import S3Path -from ..s3.s3_file import S3File +from ..pipeline import Activity +from ..pipeline import CopyActivity +from ..pipeline import S3Node +from ..s3 import S3Path +from ..s3 import S3File from ..utils.exceptions import ETLInputError config = Config() diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index 34c6132..7edfa5a 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -2,7 +2,7 @@ ETL step wrapper for creating an S3 node for input from local files """ from .etl_step import ETLStep -from ..s3.s3_file import S3File +from ..s3 import S3File class ExtractLocalStep(ETLStep): diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index b363e5d..8ed5207 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -5,10 +5,10 @@ from ..config import Config from .etl_step import ETLStep -from ..pipeline.copy_activity import CopyActivity -from ..pipeline.mysql_node import MysqlNode -from ..pipeline.pipeline_object import PipelineObject -from ..pipeline.shell_command_activity import ShellCommandActivity +from ..pipeline import CopyActivity +from ..pipeline import MysqlNode +from ..pipeline import PipelineObject +from ..pipeline import ShellCommandActivity from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index a94a6f9..72883c2 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -2,8 +2,8 @@ ETL step wrapper for RedshiftCopyActivity to extract data to S3 """ from .etl_step import ETLStep -from ..pipeline.redshift_node import RedshiftNode -from ..pipeline.redshift_copy_activity import RedshiftCopyActivity +from ..pipeline import RedshiftNode +from ..pipeline import RedshiftCopyActivity class ExtractRedshiftStep(ETLStep): diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index a970cea..d3bebcc 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -2,7 +2,7 @@ ETL step wrapper for creating an S3 node for input """ from .etl_step import ETLStep -from ..s3.s3_path import S3Path +from ..s3 import S3Path class ExtractS3Step(ETLStep): diff --git a/dataduct/steps/load_redshift.py b/dataduct/steps/load_redshift.py index 40b9b59..d4d605e 100644 
--- a/dataduct/steps/load_redshift.py +++ b/dataduct/steps/load_redshift.py @@ -2,8 +2,8 @@ ETL step wrapper for RedshiftCopyActivity to load data into Redshift """ from .etl_step import ETLStep -from ..pipeline.redshift_node import RedshiftNode -from ..pipeline.redshift_copy_activity import RedshiftCopyActivity +from ..pipeline import RedshiftNode +from ..pipeline import RedshiftCopyActivity class LoadRedshiftStep(ETLStep): diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 11169d4..7a0e470 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -2,8 +2,8 @@ ETL step wrapper for SqlActivity can be executed on Ec2 """ from .etl_step import ETLStep -from ..pipeline.sql_activity import SqlActivity -from ..s3.s3_file import S3File +from ..pipeline import SqlActivity +from ..s3 import S3File from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 6d01c54..2e0eb3c 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -2,8 +2,8 @@ ETL step wrapper for shell command activity can be executed on Ec2 / EMR """ from .etl_step import ETLStep -from ..pipeline.shell_command_activity import ShellCommandActivity -from ..s3.s3_file import S3File +from ..pipeline import ShellCommandActivity +from ..s3 import S3File from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError From 16056a1327bc85685f98b4eff63b0f7c89eb1a1a Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 23 Dec 2014 16:29:42 -0800 Subject: [PATCH 004/175] QA steps --- dataduct/config/config.py | 7 ++ dataduct/etl_pipeline.py | 1 + dataduct/qa/__init__.py | 4 ++ dataduct/qa/check.py | 108 +++++++++++++++++++++++++++++++ dataduct/qa/column_check.py | 83 ++++++++++++++++++++++++ dataduct/qa/count_check.py | 50 ++++++++++++++ dataduct/qa/primary_key_check.py | 37 +++++++++++ dataduct/qa/utils.py | 8 +++ dataduct/steps/qa_transform.py | 2 +- dataduct/steps/transform.py | 50 ++++++++++++++ requirements.txt | 1 + setup.py | 3 +- 12 files changed, 352 insertions(+), 2 deletions(-) create mode 100644 dataduct/qa/__init__.py create mode 100644 dataduct/qa/check.py create mode 100644 dataduct/qa/column_check.py create mode 100644 dataduct/qa/count_check.py create mode 100644 dataduct/qa/primary_key_check.py create mode 100644 dataduct/qa/utils.py diff --git a/dataduct/config/config.py b/dataduct/config/config.py index 95dc965..a38b8fd 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -31,4 +31,11 @@ class Config(object): _shared_config = load_yaml(DataductConfigFiles) def __init__(self): + """Constructor for the config class + """ self.__dict__ = self._shared_config + + def print_config(self): + """Print the config file + """ + print yaml.dump(self._shared_config, default_flow_style=False) diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index 836894d..d998119 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -346,6 +346,7 @@ def determine_step_class(self, step_type, step_args): elif step_type == 'qa-transform': step_class = QATransformStep + step_args['pipeline_name'] = self.name elif step_type == 'extract-s3': step_class = ExtractS3Step diff --git a/dataduct/qa/__init__.py b/dataduct/qa/__init__.py new file mode 100644 index 0000000..d308c43 --- /dev/null +++ b/dataduct/qa/__init__.py @@ -0,0 +1,4 @@ +from check import Check +from count_check import CountCheck +from column_check 
import ColumnCheck +from primary_key_check import PrimaryKeyCheck diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py new file mode 100644 index 0000000..7a0cfaf --- /dev/null +++ b/dataduct/qa/check.py @@ -0,0 +1,108 @@ +"""Base class for QA steps that provides template function for publishing +""" +from boto.sns import SNSConnection +from ..config import Config +from .utils import render_output + + +config = Config() +SNS_TOPIC_ARN_WARNING = config.etl['SNS_TOPIC_ARN_WARNING'] + + +class Check(object): + """Base class for QA steps that provides template function for publishing + """ + def __init__(self, name, tolerance=0, sns_topic_arn=SNS_TOPIC_ARN_WARNING): + """Constructor for Check class + + Args: + name(str): Name of the QA test + tolerance(float): Error tolerance levels for the ETL + sns_topic_arn(str): sns topic arn for QA test + """ + self.name = name + self.sns_topic_arn = sns_topic_arn + self.tolerance = tolerance + self.alert_func = self.get_sns_alert_function() + + def get_sns_alert_function(self): + """Get a lamdda function for SNS alert publishing + """ + if self.sns_topic_arn is None: + return None + return lambda message, subject: \ + SNSConnection().publish(self.sns_topic_arn, message, subject) + + @property + def success(self): + """True if error rate is below the tolerance levels + """ + return self.error_rate is not None and \ + self.error_rate <= self.tolerance + + @property + def summary(self): + """Summary information about this test. This text must not + contain any PII or otherwise sensitive data that cannot + be published via email. + """ + return render_output( + [ + 'Test Name: %s' % self.name, + 'Success: %s' % self.success + ] + ) + + @property + def results(self): + """The results of this test. This may contain PII, as it + should only be sent to S3 or Redshift. The default results are empty. + Subclasses may override this. 
+ """ + # The default can just be the summary text as risk isn't increasing + return self.summary + + @property + def error_rate(self): + """The error rate for the QA test + """ + return None + + @property + def export_output(self): + """List of data associated with this check for analytics + """ + return [ + self.name, + 1 if self.success else 0, + self.tolerance, + self.error_rate, + ] + + @property + def alert_subject(self): + """String for alerts in case of calling the alert_func + """ + return "Failure on %s" % self.name + + def publish(self, export_func=None): + """Publish the results of the QA test + + Note: + Prints result summary, Exports check data, Call the alert function + if specified + """ + + # Print results for logs + print self.results + print self.summary + + if export_func is not None: + export_func(self.export_output) + + if not self.success: + if self.alert_func is not None: + # Send summary to alert func for further publishing + self.alert_func(self.summary, self.alert_subject) + else: + raise Exception(self.alert_subject) diff --git a/dataduct/qa/column_check.py b/dataduct/qa/column_check.py new file mode 100644 index 0000000..cc5e172 --- /dev/null +++ b/dataduct/qa/column_check.py @@ -0,0 +1,83 @@ +"""QA test for comparing columns in the source system with the Warehouse +""" +from .check import Check +from .utils import render_output + + +class ColumnCheck(Check): + """QA test for comparing columns across the ETL + """ + def __init__(self, source_data, destination_data, **kwargs): + """Constructor for the Count based QA + + Args: + source_data(DataFrame): Sample of source data + destination_data(DataFrame): Sample of destination data + """ + super(ColumnCheck, self).__init__(**kwargs) + self.source_data = source_data + self.destination_data = destination_data + self.errors = [] + self.observed = 0 + + # Identify errors + for key in source_data.index: + if key not in destination_data.index: + continue + + source_value = ColumnCheck.column_value(self.source_data, key) + dest_value = ColumnCheck.column_value(self.destination_data, key) + + if source_value != dest_value: + self.errors.append((key, source_value, dest_value)) + self.observed += 1 + + @property + def error_rate(self): + """The error rate for the column comparisons + + Note: + The error is only calculated for keys that exist in both dataframes. + Thus, we presume that issues dealing with row counts are addressed + in a separate QA test. 
+ """ + if self.observed == 0: + return None + + return float(len(self.errors) * 100) / self.observed + + @staticmethod + def column_value(data, key): + """Fetch the value for a key in the dataframe + + Args: + data(DataFrame): Single column dataframe + key(str): Key to lookup in the dataframe + + Returns: + value(str): Value for the key, unicode values are encoded as utf-8 + """ + value = data.loc[key].values[0] + if isinstance(value, unicode): + return value.encode('utf-8') + return value + + @property + def summary(self): + """Summary of the test results for the SNS message + """ + return render_output( + [ + 'Test Name: %s' % self.name, + 'Success: %s' % self.success, + 'Tolerance: %0.4f%%' % self.tolerance, + 'Error Rate: %0.4f%%' % self.error_rate, + 'Observed: %d' % self.observed, + ] + ) + + @property + def results(self): + """Results from the the comparison of the errors + """ + return render_output([str(a) for a in self.errors]) diff --git a/dataduct/qa/count_check.py b/dataduct/qa/count_check.py new file mode 100644 index 0000000..7a4437e --- /dev/null +++ b/dataduct/qa/count_check.py @@ -0,0 +1,50 @@ +"""QA test for comparing number of rows in the source system with the Warehouse +""" + +from .check import Check +from .utils import render_output + + +class CountCheck(Check): + """QA test for comparing number of rows across the ETL + """ + def __init__(self, source_count, destination_count, **kwargs): + """Constructor for the Count based QA + + Args: + source_count(int): Count of objects in the source system + destination_count(int): Count of objects in the warehouse + """ + super(CountCheck, self).__init__(**kwargs) + self.source_count = source_count + self.destination_count = destination_count + + @property + def error_rate(self): + """The error rate. + If there are no values in the source or destination, the error is 0. 
+ If there are no values in the source but some in the destination, + the error is None + """ + if self.source_count > 0: + error_difference = float(self.source_count - self.destination_count) + return abs(error_difference * 100) / self.source_count + elif self.destination_count == 0: + return 0 + else: + return None + + @property + def summary(self): + """Summary of the test results for the SNS message + """ + return render_output( + [ + 'Test Name: %s' % self.name, + 'Success: %s' % self.success, + 'Tolerance: %0.4f%%' % self.tolerance, + 'Error Rate: %0.4f%%' % self.error_rate, + 'Source Count: %d' % self.source_count, + 'Destination Count: %d' % self.destination_count, + ] + ) diff --git a/dataduct/qa/primary_key_check.py b/dataduct/qa/primary_key_check.py new file mode 100644 index 0000000..86cf061 --- /dev/null +++ b/dataduct/qa/primary_key_check.py @@ -0,0 +1,37 @@ +"""QA test for we have duplicate primary keys inside redshift +""" + +from .check import Check +from .utils import render_output + + +class PrimaryKeyCheck(Check): + """QA test for checking duplicate primary keys inside redshift + """ + def __init__(self, duplicate_count=0, **kwargs): + """Constructor for Primary Key Check + + Args: + duplicate_count(int): Number of duplicates + """ + super(PrimaryKeyCheck, self).__init__(**kwargs) + self.duplicate_count = duplicate_count + + @property + def error_rate(self): + """The error rate for the QA test + """ + return self.duplicate_count + + @property + def summary(self): + """Summary of the test results for the SNS message + """ + return render_output( + [ + 'Test Name: %s' % self.name, + 'Success: %s' % self.success, + 'Tolerance: %d' % self.tolerance, + 'Error Rate: %d' % self.error_rate, + ] + ) diff --git a/dataduct/qa/utils.py b/dataduct/qa/utils.py new file mode 100644 index 0000000..43b1865 --- /dev/null +++ b/dataduct/qa/utils.py @@ -0,0 +1,8 @@ +""" +Shared utility functions +""" + +def render_output(data): + """Print the formatted output for the list + """ + return '\n'.join(['[Dataduct]: '].extend(data)) diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index 856a579..809ed82 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -14,8 +14,8 @@ class QATransformStep(TransformStep): def __init__(self, id, - script_arguments, pipeline_name, + script_arguments=None, sns_topic_arn=SNS_TOPIC_ARN_WARNING, **kwargs): """Constructor for the QATransformStep class diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 2e0eb3c..e9febba 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -7,6 +7,9 @@ from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError +SCRIPT_ARGUMENT_TYPE_STRING = 'string' +SCRIPT_ARGUMENT_TYPE_SQL = 'sql' + class TransformStep(ETLStep): """Transform Step class that helps run scripts on resouces @@ -48,6 +51,8 @@ def __init__(self, if script: script = self.create_script(S3File(path=script)) + script_arguments = self.translate_arguments(script_arguments) + self.create_pipeline_object( object_class=ShellCommandActivity, input_node=self._input_node, @@ -68,3 +73,48 @@ def __init__(self, self._output = self.create_output_nodes(output_node, output) else: self._output = output_node + + def translate_arguments(self, script_arguments): + """Translate script argument to lists + + Args: + script_arguments(list of str/dict): arguments to the script + + Note: + Dict: (k -> v) is turned into an argument "--k=v" + List: Either pure strings 
or dictionaries with name, type and value + """ + if script_arguments is None: + return script_arguments + + elif isinstance(script_arguments, list): + result = list() + for argument in script_arguments: + if isinstance(argument, dict): + argument_type = argument.get('type', + SCRIPT_ARGUMENT_TYPE_STRING) + if argument_type == SCRIPT_ARGUMENT_TYPE_SQL: + # TODO: Change to SQL Parsing + result.append(self.input_format( + argument['name'], argument['value'])) + else: + result.append(self.input_format( + argument['name'], argument['value'])) + else: + result.append(str(argument)) + return result + + elif isinstance(script_arguments, dict): + return [self.input_format(key, value) + for key, value in script_arguments.iteritems()] + + elif isinstance(script_arguments, str): + return [script_arguments] + + else: + raise ETLInputError('Script Arguments for unrecognized type') + + def input_format(self, key, value): + """Format the key and value to command line arguments + """ + return ''.join('--', key, '=', value) diff --git a/requirements.txt b/requirements.txt index 1007ddc..3673b70 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ Sphinx==1.2.3 boto==2.34.0 sphinx-rtd-theme==0.1.6 sphinxcontrib-napoleon==0.2.8 +pandas==0.14.1 diff --git a/setup.py b/setup.py index e4bbd7b..1584bdb 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ 'dataduct', 'dataduct.config', 'dataduct.pipeline', + 'dataduct.qa', 'dataduct.s3', 'dataduct.steps', 'dataduct.utils', @@ -21,7 +22,7 @@ long_description=open('README.rst').read(), author_email='data-infra@coursera.org', license='Apache License 2.0', - description='DataPipeline for Humans.', + description='DataPipeline for Humans', install_requires=[ 'boto>=2.32', 'pyyaml' From 95658b64bfda4d7a5e20b23705595919c365f2d6 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 27 Dec 2014 18:14:46 -0800 Subject: [PATCH 005/175] adding base path --- dataduct/config/example_config | 2 +- dataduct/etl_pipeline.py | 9 ++--- dataduct/qa/__init__.py | 8 ++--- docs/installation.rst | 2 +- examples/load_redshift.yaml | 3 ++ examples/scripts/primary_key_test.py | 52 ++++++++++++++++++++++++++++ 6 files changed, 66 insertions(+), 10 deletions(-) create mode 100644 examples/scripts/primary_key_test.py diff --git a/dataduct/config/example_config b/dataduct/config/example_config index f030b42..416fd8c 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -35,7 +35,7 @@ mysql: etl: RETRY_DELAY: 10 Minutes DEFAULT_MAX_RETRIES: 0 - ETL_BUCKET: FILL_ME_IN + S3_ETL_BUCKET: FILL_ME_IN SNS_TOPIC_ARN_FAILURE: FILL_ME_IN SNS_TOPIC_ARN_WARNING: FILL_ME_IN DAILY_LOAD_TIME: 1 # run at 1AM UTC diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index d998119..86158d0 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -34,7 +34,8 @@ config = Config() DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -ETL_BUCKET = config.etl['ETL_BUCKET'] +S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] +S3_BASE_PATH = config.etl['S3_BASE_PATH'] BOOTSTRAP_STEPS_DEFINITION = config.bootstrap EC2_RESOURCE_STR = 'ec2' @@ -203,16 +204,16 @@ def _s3_uri(self, data_type): raise ETLInputError('Unknown data type found') # Versioning prevents using data from older versions - key = [data_type, self.name, self.version_name] + key = [S3_BASE_PATH, data_type, self.name, self.version_name] if self.frequency == 'daily' and data_type in [LOG_STR, DATA_STR]: # For repeated loads, include load date key.append("#{format(@scheduledStartTime, 
'YYYYMMdd')}") if data_type == LOG_STR: - return S3LogPath(key, bucket=ETL_BUCKET, is_directory=True) + return S3LogPath(key, bucket=S3_ETL_BUCKET, is_directory=True) else: - return S3Path(key, bucket=ETL_BUCKET, is_directory=True) + return S3Path(key, bucket=S3_ETL_BUCKET, is_directory=True) @property def s3_log_dir(self): diff --git a/dataduct/qa/__init__.py b/dataduct/qa/__init__.py index d308c43..c660cf5 100644 --- a/dataduct/qa/__init__.py +++ b/dataduct/qa/__init__.py @@ -1,4 +1,4 @@ -from check import Check -from count_check import CountCheck -from column_check import ColumnCheck -from primary_key_check import PrimaryKeyCheck +from .check import Check +from .count_check import CountCheck +from .column_check import ColumnCheck +from .primary_key_check import PrimaryKeyCheck diff --git a/docs/installation.rst b/docs/installation.rst index 5640c25..96e5e6a 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -66,7 +66,7 @@ environment variable pointing to the config file location by setting the etl: RETRY_DELAY: 10 Minutes DEFAULT_MAX_RETRIES: 0 - ETL_BUCKET: FILL_ME_IN + S3_ETL_BUCKET: FILL_ME_IN SNS_TOPIC_ARN_FAILURE: FILL_ME_IN SNS_TOPIC_ARN_WARNING: FILL_ME_IN DAILY_LOAD_TIME: 1 # run at 1AM UTC diff --git a/examples/load_redshift.yaml b/examples/load_redshift.yaml index 735a386..5d7c2cd 100644 --- a/examples/load_redshift.yaml +++ b/examples/load_redshift.yaml @@ -11,3 +11,6 @@ steps: - step_type: load-redshift schema: dev table: test_table + +- step_type: qa-transform + script: examples/scripts/primary_key_test.py diff --git a/examples/scripts/primary_key_test.py b/examples/scripts/primary_key_test.py new file mode 100644 index 0000000..a79a80e --- /dev/null +++ b/examples/scripts/primary_key_test.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + + +import argparse +import pandas.io.sql as pdsql +from dataduct.qa import PrimaryKeyCheck +# from datapipeline.database.table import Table +# from datapipeline.qa.check import Check +# from datapipeline.qa.check import get_sns_alert_fn +# from datapipeline.qa.s3 import qa_check_export_fn +# from datapipeline.data_access.connections import redshift_connection + + +def query_redshift(production, query): + """ + Input: + - prod -- whether to reference the prod table + - query -- a query that computes a count + Output: + - the value returned by the query + """ + print "Running query", query + return pdsql.read_sql(query, redshift_connection(production)) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--table', dest='table', required=True) + parser.add_argument('--production', dest='production', action='store_true') + parser.add_argument('--pipeline_name', dest='pipeline_name', required=True) + + parser.add_argument( + '--sns_topic', dest='sns_topic', default=None) + parser.add_argument( + '--test_name', dest='test_name', default="Check Maestro Column") + + args = parser.parse_args() + print "Got args for check primary key", args + + table = Table(script=args.table) + result = query_redshift( + args.production, + table.select_duplicates_sql().raw_sql(), + ) + + check = PrimaryKeyCheck( + len(result), args.test_name, get_sns_alert_fn(args.sns_topic)) + check.publish(qa_check_export_fn( + args.production, args.pipeline_name, table=table.full_name)) + + print "Passed test." 
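A minimal usage sketch of the QA checks added in the two patches above, assuming a dataduct config file whose etl section (including SNS_TOPIC_ARN_WARNING, which dataduct.qa reads at import time) is in place and that pandas from requirements.txt is installed; the sample frames, check name, and tolerance below are placeholders, not part of the library:

    import pandas as pd
    from dataduct.qa import ColumnCheck

    # Hypothetical one-column samples keyed by user id: one pulled from the
    # source database, one from the warehouse.
    source = pd.DataFrame({'email': ['a@x.com', 'b@x.com', 'c@x.com']},
                          index=[1, 2, 3])
    destination = pd.DataFrame({'email': ['a@x.com', 'B@x.com', 'c@x.com']},
                               index=[1, 2, 3])

    # tolerance and error_rate are percentages; only keys present in both
    # frames are compared, so row-count gaps are left to CountCheck.
    check = ColumnCheck(source, destination, name='users.email', tolerance=1.0)
    print check.error_rate  # ~33.3: one mismatch out of three shared keys
    print check.success     # False: the error rate exceeds the 1% tolerance

On failure, Check.publish() hands the summary to the SNS alert function built from sns_topic_arn, which is how a qa-transform script is meant to surface warnings.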
From 941c2dcb341b3f92bf4d9edeac59991d56f69cc0 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 01:57:31 -0800 Subject: [PATCH 006/175] visualize pipelines --- bin/dataduct | 16 ++++++- dataduct/definition_parser.py | 72 +++++++++++++++++++++++++++++ dataduct/pipeline/default_object.py | 4 +- 3 files changed, 88 insertions(+), 4 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index aef6933..bf01748 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -5,15 +5,17 @@ Script that helps create and validate pipelines from command line """ import argparse -from dataduct.definition_parser import read_pipeline_definition +from dataduct.definition_parser import activate_pipeline from dataduct.definition_parser import create_pipeline +from dataduct.definition_parser import read_pipeline_definition from dataduct.definition_parser import validate_pipeline -from dataduct.definition_parser import activate_pipeline +from dataduct.definition_parser import visualize_pipeline CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' +VISUALIZE_STR = 'visualize' def main(): """Main function""" @@ -26,6 +28,7 @@ def main(): CREATE_STR: 'Create a pipeline locally', VALIDATE_STR: 'Validate a pipeline with AWS without activating', ACTIVATE_STR: 'create a pipeline and activate it on AWS', + VISUALIZE_STR: 'visualize a pipeline', }, default=CREATE_STR, ) @@ -42,11 +45,20 @@ def main(): help='Indicates that if this pipeline exists, it will be destroyed' ' first.', ) + parser.add_argument( + '-filename', + '--filename', + default=None, + help='Indicates that if this pipeline exists, it will be destroyed' + ' first.', + ) args = parser.parse_args() for load_definition in args.load_definitions: definition = read_pipeline_definition(load_definition) etl = create_pipeline(definition) + if args.action in [VISUALIZE_STR]: + visualize_pipeline(etl, args.filename) if args.action in [VALIDATE_STR, ACTIVATE_STR]: validate_pipeline(etl, args.force_overwrite) if args.action == ACTIVATE_STR: diff --git a/dataduct/definition_parser.py b/dataduct/definition_parser.py index 2bb55d3..0a32be6 100644 --- a/dataduct/definition_parser.py +++ b/dataduct/definition_parser.py @@ -3,6 +3,10 @@ """ import yaml +from .pipeline import Activity +from .pipeline import MysqlNode +from .pipeline import RedshiftNode +from .pipeline import S3Node from .etl_pipeline import ETLPipeline from .utils.exceptions import ETLInputError @@ -35,6 +39,7 @@ def read_pipeline_definition(file_path): return definition + def create_pipeline(definition): """Creates the pipeline and add the steps specified to the pipeline @@ -50,6 +55,7 @@ def create_pipeline(definition): return etl + def validate_pipeline(etl, force_overwrite=False): """Validates the pipeline that was created @@ -62,6 +68,7 @@ def validate_pipeline(etl, force_overwrite=False): etl.validate() print 'Validated pipeline. Id: %s' % etl.pipeline.id + def activate_pipeline(etl): """Activate the pipeline that was created @@ -70,3 +77,68 @@ def activate_pipeline(etl): """ etl.activate() print 'Activated pipeline. 
Id: %s' % etl.pipeline.id + + +def visualize_pipeline(etl, filename=None): + """Visualize the pipeline that was created + + Args: + etl(EtlPipeline): pipeline object that needs to be visualized + filename(str): filepath for saving the graph + """ + # Import pygraphviz for plotting the graphs + try: + import pygraphviz + except ImportError: + raise ImportError('Install pygraphviz for visualizing pipelines') + + if filename is None: + raise ETLInputError('Filename must be provided for visualization') + + graph = pygraphviz.AGraph(name=etl.name, directed=True, label=etl.name) + + pipeline_objects = etl.pipeline_objects() + + # Add nodes for all activities + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + graph.add_node(p_object.id, shape='diamond', color='turquoise', + style='filled') + if isinstance(p_object, MysqlNode): + graph.add_node(p_object.id, shape='egg', color='beige', + style='filled') + if isinstance(p_object, RedshiftNode): + graph.add_node(p_object.id, shape='egg', color='goldenrod', + style='filled') + if isinstance(p_object, S3Node): + graph.add_node(p_object.id, shape='folder', color='grey', + style='filled') + + # Add data dependencies + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + if p_object.input: + if isinstance(p_object.input, list): + for ip in p_object.input: + graph.add_edge(ip.id, p_object.id) + else: + graph.add_edge(p_object.input.id, p_object.id) + if p_object.output: + graph.add_edge(p_object.id, p_object.output.id) + + # Add depends_on dependencies + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + if isinstance(p_object.depends_on, list): + dependencies = p_object.depends_on + elif isinstance(p_object.depends_on, Activity): + dependencies = [p_object.depends_on] + else: + continue + + for dependency in dependencies: + graph.add_edge(dependency.id, p_object.id, color='blue') + + # Plotting the graph with dot layout + graph.layout(prog='dot') + graph.draw(filename) diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index 98bbbec..0e3da7e 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -15,7 +15,7 @@ class DefaultObject(PipelineObject): """ def __init__(self, - id='Default', + id, sns=None, scheduleType='cron', failureAndRerunMode='CASCADE', @@ -34,7 +34,7 @@ def __init__(self, """ super(DefaultObject, self).__init__( - id=id, + id='Default', # This should always have the default id scheduleType=scheduleType, failureAndRerunMode=failureAndRerunMode, role=DEFAULT_ROLE, From 4e2564db94537f89dd7e5d7e6dd75b4bcc2934c0 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 10:55:00 -0800 Subject: [PATCH 007/175] SNS fix --- .travis.yml | 2 ++ dataduct/etl_pipeline.py | 15 +++++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3afa070..19a9de5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -6,3 +6,5 @@ install: # command to run tests script: nosetests + +# TODO: Setup config file so that we can actually run this diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index 86158d0..7108ee9 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -36,6 +36,7 @@ DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] S3_BASE_PATH = config.etl['S3_BASE_PATH'] +SNS_TOPIC_ARN_FAILURE = config.etl['SNS_TOPIC_ARN_FAILURE'] BOOTSTRAP_STEPS_DEFINITION = config.bootstrap EC2_RESOURCE_STR = 
'ec2' @@ -162,12 +163,14 @@ def create_base_objects(self): load_hour=self.load_hour, load_min=self.load_min, ) - # self.sns = None -> Used for testing - self.sns = self.create_pipeline_object( - object_class=SNSAlarm, - topic_arn=self.topic_arn, - pipeline_name=self.name, - ) + if self.topic_arn is None and SNS_TOPIC_ARN_FAILURE is None: + self.sns = None + else: + self.sns = self.create_pipeline_object( + object_class=SNSAlarm, + topic_arn=self.topic_arn, + pipeline_name=self.name, + ) self.default = self.create_pipeline_object( object_class=DefaultObject, sns=self.sns, From dd1b9f49084e650401849960d88e0e497533c181 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 15:55:41 -0800 Subject: [PATCH 008/175] examples updated --- MANIFEST.in | 1 - dataduct/definition_parser.py | 4 ++++ dataduct/etl_pipeline.py | 6 ++++-- examples/README.md | 2 +- examples/example_bootstrap.yaml | 16 ++++++++++++++++ ...uble_input.yaml => example_double_input.yaml} | 0 ...le_output.yaml => example_double_output.yaml} | 0 ...streaming.yaml => example_emr_streaming.yaml} | 0 ...act_local.yaml => example_extract_local.yaml} | 0 ...extract_rds.yaml => example_extract_rds.yaml} | 0 ...dshift.yaml => example_extract_redshift.yaml} | 0 .../{extract_s3.yaml => example_extract_s3.yaml} | 0 ..._redshift.yaml => example_load_redshift.yaml} | 0 ...sql_command.yaml => example_sql_command.yaml} | 0 .../{transform.yaml => example_transform.yaml} | 0 15 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 examples/example_bootstrap.yaml rename examples/{double_input.yaml => example_double_input.yaml} (100%) rename examples/{double_output.yaml => example_double_output.yaml} (100%) rename examples/{emr_streaming.yaml => example_emr_streaming.yaml} (100%) rename examples/{extract_local.yaml => example_extract_local.yaml} (100%) rename examples/{extract_rds.yaml => example_extract_rds.yaml} (100%) rename examples/{extract_redshift.yaml => example_extract_redshift.yaml} (100%) rename examples/{extract_s3.yaml => example_extract_s3.yaml} (100%) rename examples/{load_redshift.yaml => example_load_redshift.yaml} (100%) rename examples/{sql_command.yaml => example_sql_command.yaml} (100%) rename examples/{transform.yaml => example_transform.yaml} (100%) diff --git a/MANIFEST.in b/MANIFEST.in index adff763..5276a85 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,7 +1,6 @@ include *.txt include *.md include *.rst -include *.sh include *.py recursive-include bin * recursive-include scripts * diff --git a/dataduct/definition_parser.py b/dataduct/definition_parser.py index 0a32be6..471abef 100644 --- a/dataduct/definition_parser.py +++ b/dataduct/definition_parser.py @@ -10,6 +10,8 @@ from .etl_pipeline import ETLPipeline from .utils.exceptions import ETLInputError +URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa + def read_pipeline_definition(file_path): """Function reads the yaml pipeline definitions. @@ -77,6 +79,8 @@ def activate_pipeline(etl): """ etl.activate() print 'Activated pipeline. 
Id: %s' % etl.pipeline.id + print 'Monitor pipeline here: %s' % \ + URL_TEMPLATE.format(ID=etl.pipeline.id) def visualize_pipeline(etl, filename=None): diff --git a/dataduct/etl_pipeline.py b/dataduct/etl_pipeline.py index 7108ee9..03f4f73 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl_pipeline.py @@ -56,7 +56,8 @@ class ETLPipeline(object): def __init__(self, name, frequency='one-time', ec2_resource_terminate_after='6 Hours', delay=None, emr_cluster_config=None, load_time=None, - topic_arn=None, max_retries=DEFAULT_MAX_RETRIES): + topic_arn=None, max_retries=DEFAULT_MAX_RETRIES, + bootstrap=BOOTSTRAP_STEPS_DEFINITION): """Example of docstring on the __init__ method. The __init__ method may be documented in either the class level @@ -89,6 +90,7 @@ def __init__(self, name, frequency='one-time', self.load_min = load_min self.max_retries = max_retries self.topic_arn = topic_arn + self.bootstrap_definitions = bootstrap if emr_cluster_config: self.emr_cluster_config = emr_cluster_config @@ -590,7 +592,7 @@ def create_bootstrap_steps(self, resource_type): resource_type(enum of str): type of resource we're bootstraping can be ec2 / emr """ - step_params = BOOTSTRAP_STEPS_DEFINITION + step_params = self.bootstrap_definitions selected_steps = list() for step in step_params: step['name'] += '_' + resource_type # Append type for unique names diff --git a/examples/README.md b/examples/README.md index 269fe2c..b1ca096 100644 --- a/examples/README.md +++ b/examples/README.md @@ -1 +1 @@ -#### THIS IS THE EXAMPLES FOLDER \ No newline at end of file +#### Examples diff --git a/examples/example_bootstrap.yaml b/examples/example_bootstrap.yaml new file mode 100644 index 0000000..b16abb6 --- /dev/null +++ b/examples/example_bootstrap.yaml @@ -0,0 +1,16 @@ +name : example_bootstrap +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +description : Example for the transform step + +bootstrap: +- step_type: transform + input_node: [] + command: pip install git+https://github.com/coursera/dataduct.git >> ${OUTPUT1_STAGING_DIR}/output.txt + name: bootstrap + +steps: +- step_type: transform + input_node: [] + command: python -c "import dataduct" >> ${OUTPUT1_STAGING_DIR}/output.txt diff --git a/examples/double_input.yaml b/examples/example_double_input.yaml similarity index 100% rename from examples/double_input.yaml rename to examples/example_double_input.yaml diff --git a/examples/double_output.yaml b/examples/example_double_output.yaml similarity index 100% rename from examples/double_output.yaml rename to examples/example_double_output.yaml diff --git a/examples/emr_streaming.yaml b/examples/example_emr_streaming.yaml similarity index 100% rename from examples/emr_streaming.yaml rename to examples/example_emr_streaming.yaml diff --git a/examples/extract_local.yaml b/examples/example_extract_local.yaml similarity index 100% rename from examples/extract_local.yaml rename to examples/example_extract_local.yaml diff --git a/examples/extract_rds.yaml b/examples/example_extract_rds.yaml similarity index 100% rename from examples/extract_rds.yaml rename to examples/example_extract_rds.yaml diff --git a/examples/extract_redshift.yaml b/examples/example_extract_redshift.yaml similarity index 100% rename from examples/extract_redshift.yaml rename to examples/example_extract_redshift.yaml diff --git a/examples/extract_s3.yaml b/examples/example_extract_s3.yaml similarity index 100% rename from examples/extract_s3.yaml rename to examples/example_extract_s3.yaml diff --git a/examples/load_redshift.yaml 
b/examples/example_load_redshift.yaml similarity index 100% rename from examples/load_redshift.yaml rename to examples/example_load_redshift.yaml diff --git a/examples/sql_command.yaml b/examples/example_sql_command.yaml similarity index 100% rename from examples/sql_command.yaml rename to examples/example_sql_command.yaml diff --git a/examples/transform.yaml b/examples/example_transform.yaml similarity index 100% rename from examples/transform.yaml rename to examples/example_transform.yaml From 3abe4133a524047e6e198985c1974199190b02fc Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 20:05:01 -0800 Subject: [PATCH 009/175] connections --- dataduct/config/example_config | 8 +-- dataduct/data_access/__init__.py | 0 dataduct/data_access/connection.py | 45 ++++++++++++++++ dataduct/pipeline/redshift_database.py | 16 +++--- dataduct/utils/helpers.py | 75 ++++++++++++++++++++++++++ docs/installation.rst | 8 +-- 6 files changed, 136 insertions(+), 16 deletions(-) create mode 100644 dataduct/data_access/__init__.py create mode 100644 dataduct/data_access/connection.py diff --git a/dataduct/config/example_config b/dataduct/config/example_config index 416fd8c..171fd31 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -21,10 +21,10 @@ emr: DEFAULT_CLUSTER_AMI: 2.4.7 redshift: - REDSHIFT_DATABASE_NAME: FILL_ME_IN - REDSHIFT_CLUSTER_ID: FILL_ME_IN - REDSHIFT_USERNAME: FILL_ME_IN - REDSHIFT_PASSWORD: FILL_ME_IN + DATABASE_NAME: FILL_ME_IN + CLUSTER_ID: FILL_ME_IN + USERNAME: FILL_ME_IN + PASSWORD: FILL_ME_IN mysql: DATABASE_KEY: diff --git a/dataduct/data_access/__init__.py b/dataduct/data_access/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py new file mode 100644 index 0000000..839f874 --- /dev/null +++ b/dataduct/data_access/connection.py @@ -0,0 +1,45 @@ +""" +Connections to various databases such as RDS and Redshift +""" +import psycopg2 +import MySQLdb + +from ..config import Config +from ..utils.helpers import retry + +config = Config() + +@retry(2, 60) +def redshift_connection(**kwargs): + """Fetch a psql connection object to redshift + """ + connection = psycopg2.connect( + host=config.redshift['HOST'], + user=config.redshift['USERNAME'], + password=config.redshift['PASSWORD'], + port=config.redshift['PORT'], + database=config.redshift['DATABASE_NAME'], + connect_timeout=10, + **kwargs + ) + + return connection + +@retry(2, 60) +def rds_connection(host_name, cursorclass=MySQLdb.cursors.SSCursor, + **kwargs): + """Fetch a psql connection object to redshift + """ + sql_creds = config.mysql[host_name] + + connection = MySQLdb.connect( + host=sql_creds['HOST'], + user=sql_creds['USERNAME'], + passwd=sql_creds['PASSWORD'], + db=host_name, + charset='utf8', # Necessary for foreign chars + cursorclass=cursorclass, + **kwargs + ) + + return connection diff --git a/dataduct/pipeline/redshift_database.py b/dataduct/pipeline/redshift_database.py index 4302a33..0eacae9 100644 --- a/dataduct/pipeline/redshift_database.py +++ b/dataduct/pipeline/redshift_database.py @@ -6,10 +6,10 @@ from .pipeline_object import PipelineObject config = Config() -REDSHIFT_DATABASE_NAME = config.redshift['REDSHIFT_DATABASE_NAME'] -REDSHIFT_CLUSTER_ID = config.redshift['REDSHIFT_CLUSTER_ID'] -REDSHIFT_USERNAME = config.redshift['REDSHIFT_USERNAME'] -REDSHIFT_PASSWORD = config.redshift['REDSHIFT_PASSWORD'] +DATABASE_NAME = config.redshift['DATABASE_NAME'] +CLUSTER_ID = 
config.redshift['CLUSTER_ID'] +USERNAME = config.redshift['USERNAME'] +PASSWORD = config.redshift['PASSWORD'] class RedshiftDatabase(PipelineObject): @@ -18,10 +18,10 @@ class RedshiftDatabase(PipelineObject): def __init__(self, id, - database_name=REDSHIFT_DATABASE_NAME, - cluster_id=REDSHIFT_CLUSTER_ID, - username=REDSHIFT_USERNAME, - password=REDSHIFT_PASSWORD): + database_name=DATABASE_NAME, + cluster_id=CLUSTER_ID, + username=USERNAME, + password=PASSWORD): """Constructor for the RedshiftDatabase class Args: diff --git a/dataduct/utils/helpers.py b/dataduct/utils/helpers.py index 9e418c1..c1f7415 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -1,6 +1,10 @@ """ Shared utility functions """ +import time +import math +from sys import stderr + def exactly_one(*args): """Asserts one of the arguments is not None @@ -9,3 +13,74 @@ def exactly_one(*args): result(bool): True if exactly one of the arguments is not None """ return sum([1 for a in args if a is not None]) == 1 + + +def retry(tries, delay=3, backoff=2): + """Retries a function or method until it succedes + + Note: + This assume the function succeded if no exception was thrown + + Args: + tries(int): Number of attempts of the function. Must be >= 0 + delay(int): Initial delay in seconds, should be > 0 + backoff(int): Factor by which delay should increase between attempts + """ + + if backoff <= 1: + raise ValueError('backoff must be greater than 1') + + tries = math.floor(tries) + if tries < 0: + raise ValueError('tries must be 0 or greater') + + if delay <= 0: + raise ValueError('delay must be greater than 0') + + def deco_retry(f): + """Decorator for retries""" + + def function_attempt(f, *args, **kwargs): + """ + Single attempt of the function + """ + template = 'Attempt failed with Exception: \n{0}: {1}\n' + try: + r_value = f(*args, **kwargs) # first attempt + r_status = True + except Exception as exp: + stderr.write(template.format(type(exp).__name__, exp)) + r_value = exp + r_status = False + + return r_value, r_status + + def f_retry(*args, **kwargs): + """True decorator""" + m_tries, m_delay = tries, delay # make mutable + + r_value, r_status = function_attempt(f, *args, **kwargs) + + while m_tries > 0: + + # Done on success + if r_status is True: + return r_value + + m_tries -= 1 # consume an attempt + time.sleep(m_delay) # wait... 
+ m_delay *= backoff # make future wait longer + + # Try again + r_value, r_status = function_attempt(f, *args, **kwargs) + + if r_status is True: + return r_value + else: + raise r_value + + # true decorator -> decorated function + return f_retry + + # @retry(arg[, ...]) -> true decorator + return deco_retry diff --git a/docs/installation.rst b/docs/installation.rst index 96e5e6a..dae6e2c 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -52,10 +52,10 @@ environment variable pointing to the config file location by setting the DEFAULT_CLUSTER_AMI: 2.4.7 redshift: - REDSHIFT_DATABASE_NAME: FILL_ME_IN - REDSHIFT_CLUSTER_ID: FILL_ME_IN - REDSHIFT_USERNAME: FILL_ME_IN - REDSHIFT_PASSWORD: FILL_ME_IN + DATABASE_NAME: FILL_ME_IN + CLUSTER_ID: FILL_ME_IN + USERNAME: FILL_ME_IN + PASSWORD: FILL_ME_IN mysql: DATABASE_KEY: From aea2dbb40258d7703700a97190174c32bc85f4cd Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 22:09:02 -0800 Subject: [PATCH 010/175] config sync --- bin/dataduct | 24 ++++-- dataduct/config/__init__.py | 2 + dataduct/config/config.py | 76 +++++++++++++++---- dataduct/etl/__init__.py | 5 ++ .../etl_actions.py} | 10 +-- dataduct/{ => etl}/etl_pipeline.py | 55 +++++++------- 6 files changed, 122 insertions(+), 50 deletions(-) create mode 100644 dataduct/etl/__init__.py rename dataduct/{definition_parser.py => etl/etl_actions.py} (96%) rename dataduct/{ => etl}/etl_pipeline.py (96%) diff --git a/bin/dataduct b/bin/dataduct index bf01748..c2528e8 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -5,17 +5,23 @@ Script that helps create and validate pipelines from command line """ import argparse -from dataduct.definition_parser import activate_pipeline -from dataduct.definition_parser import create_pipeline -from dataduct.definition_parser import read_pipeline_definition -from dataduct.definition_parser import validate_pipeline -from dataduct.definition_parser import visualize_pipeline + +from dataduct.etl import activate_pipeline +from dataduct.etl import create_pipeline +from dataduct.etl import read_pipeline_definition +from dataduct.etl import validate_pipeline +from dataduct.etl import visualize_pipeline + +from dataduct.config import sync_to_s3 +from dataduct.config import sync_from_s3 CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' VISUALIZE_STR = 'visualize' +SYNC_CONFIG_TO_S3 = 'sync_config_to_s3' +SYNC_CONFIG_FROM_S3 = 'sync_config_from_s3' def main(): """Main function""" @@ -29,6 +35,8 @@ def main(): VALIDATE_STR: 'Validate a pipeline with AWS without activating', ACTIVATE_STR: 'create a pipeline and activate it on AWS', VISUALIZE_STR: 'visualize a pipeline', + SYNC_CONFIG_TO_S3: 'sync config file from local to s3', + SYNC_CONFIG_FROM_S3: 'sync config file from s3 to local file', }, default=CREATE_STR, ) @@ -54,6 +62,12 @@ def main(): ) args = parser.parse_args() + if args.action == SYNC_CONFIG_TO_S3: + return sync_to_s3() + + if args.action == SYNC_CONFIG_FROM_S3: + return sync_from_s3(args.filename) + for load_definition in args.load_definitions: definition = read_pipeline_definition(load_definition) etl = create_pipeline(definition) diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index cca5d9b..c77771b 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1 +1,3 @@ from .config import Config +from .config import sync_to_s3 +from .config import sync_from_s3 diff --git a/dataduct/config/config.py b/dataduct/config/config.py index a38b8fd..7d74c69 100644 --- 
a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -1,17 +1,33 @@ +"""Module that maintains the config singleton object used across the package +""" import os import yaml -# We look at (in order of precedence): -# /etc/dataduct.cfg and ~/.dataduct for configuration constants +from ..s3 import S3Path +from ..s3 import S3File -DataductConfigPath = '/etc/.dataduct' -DataductUserConfigPath = os.path.join(os.path.expanduser('~/.dataduct')) -DataductConfigFiles = [DataductConfigPath, DataductUserConfigPath] +CONFIG_STR = 'config' -# Check DATADUCT_PATH env variable for other configuration locations -if 'DATADUCT_PATH' in os.environ: - for path in os.environ['DATADUCT_PATH'].split(":"): - DataductConfigFiles.append(os.path.expanduser(path)) + +def get_config_files(): + """Get the config file for dataduct + + Note: + The order of precedence is: + 1. /etc/dataduct.cfg + 2. ~/.dataduct + 3. DATADUCT_PATH environment variable + """ + dataduct_config_path = '/etc/dataduct.cfg' + dataduct_user_config_path = os.path.join(os.path.expanduser('~/.dataduct')) + config_files = [dataduct_config_path, dataduct_user_config_path] + + # Check DATADUCT_PATH env variable for other configuration locations + if 'DATADUCT_PATH' in os.environ: + for path in os.environ['DATADUCT_PATH'].split(":"): + config_files.append(os.path.expanduser(path)) + + return config_files def load_yaml(configFiles): @@ -25,17 +41,51 @@ def load_yaml(configFiles): raise Exception('Dataduct config file is missing') +def s3_config_path(config): + """S3 uri for the config files + """ + key = [config.etl['S3_BASE_PATH'], CONFIG_STR, 'dataduct.cfg'] + return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) + + +def sync_to_s3(): + """Upload the config file to an S3 location + """ + config = Config() + s3_file = S3File(text=config.raw_config(), s3_path=s3_config_path(config)) + s3_file.upload_to_s3() + + +def sync_from_s3(filename): + """Read the config file from S3 + """ + config = Config() + s3_file = S3File(s3_path=s3_config_path(config)) + text = s3_file.text + + if filename is None: + print text + else: + with open(filename, 'w') as op_file: + op_file.write(text) + + class Config(object): """Config singleton to manage changes config variables across the package """ - _shared_config = load_yaml(DataductConfigFiles) + _shared_config = load_yaml(get_config_files()) def __init__(self): """Constructor for the config class """ self.__dict__ = self._shared_config - def print_config(self): - """Print the config file + def __str__(self): + """String output for the config object + """ + return yaml.dump(self._shared_config, default_flow_style=False) + + def raw_config(self): + """String formatted config file """ - print yaml.dump(self._shared_config, default_flow_style=False) + return self.__str__() diff --git a/dataduct/etl/__init__.py b/dataduct/etl/__init__.py new file mode 100644 index 0000000..a2707bc --- /dev/null +++ b/dataduct/etl/__init__.py @@ -0,0 +1,5 @@ +from .etl_actions import activate_pipeline +from .etl_actions import create_pipeline +from .etl_actions import read_pipeline_definition +from .etl_actions import validate_pipeline +from .etl_actions import visualize_pipeline diff --git a/dataduct/definition_parser.py b/dataduct/etl/etl_actions.py similarity index 96% rename from dataduct/definition_parser.py rename to dataduct/etl/etl_actions.py index 471abef..4f7f0b9 100644 --- a/dataduct/definition_parser.py +++ b/dataduct/etl/etl_actions.py @@ -3,12 +3,12 @@ """ import yaml -from .pipeline import Activity -from .pipeline 
import MysqlNode -from .pipeline import RedshiftNode -from .pipeline import S3Node +from ..pipeline import Activity +from ..pipeline import MysqlNode +from ..pipeline import RedshiftNode +from ..pipeline import S3Node from .etl_pipeline import ETLPipeline -from .utils.exceptions import ETLInputError +from ..utils.exceptions import ETLInputError URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa diff --git a/dataduct/etl_pipeline.py b/dataduct/etl/etl_pipeline.py similarity index 96% rename from dataduct/etl_pipeline.py rename to dataduct/etl/etl_pipeline.py index 03f4f73..f538ad3 100644 --- a/dataduct/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -4,33 +4,33 @@ from datetime import datetime import yaml -from .config import Config - -from .pipeline import DefaultObject -from .pipeline import DataPipeline -from .pipeline import Ec2Resource -from .pipeline import EmrResource -from .pipeline import RedshiftDatabase -from .pipeline import S3Node -from .pipeline import Schedule -from .pipeline import SNSAlarm -from .pipeline.utils import list_pipelines - -from .steps import EMRStreamingStep -from .steps import ExtractLocalStep -from .steps import ExtractRdsStep -from .steps import ExtractRedshiftStep -from .steps import ExtractS3Step -from .steps import LoadRedshiftStep -from .steps import SqlCommandStep -from .steps import TransformStep -from .steps import QATransformStep - -from .s3.s3_file import S3File -from .s3.s3_path import S3Path -from .s3.s3_log_path import S3LogPath - -from .utils.exceptions import ETLInputError +from ..config import Config + +from ..pipeline import DefaultObject +from ..pipeline import DataPipeline +from ..pipeline import Ec2Resource +from ..pipeline import EmrResource +from ..pipeline import RedshiftDatabase +from ..pipeline import S3Node +from ..pipeline import Schedule +from ..pipeline import SNSAlarm +from ..pipeline.utils import list_pipelines + +from ..steps import EMRStreamingStep +from ..steps import ExtractLocalStep +from ..steps import ExtractRdsStep +from ..steps import ExtractRedshiftStep +from ..steps import ExtractS3Step +from ..steps import LoadRedshiftStep +from ..steps import SqlCommandStep +from ..steps import TransformStep +from ..steps import QATransformStep + +from ..s3.s3_file import S3File +from ..s3.s3_path import S3Path +from ..s3.s3_log_path import S3LogPath + +from ..utils.exceptions import ETLInputError config = Config() DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] @@ -353,6 +353,7 @@ def determine_step_class(self, step_type, step_args): elif step_type == 'qa-transform': step_class = QATransformStep step_args['pipeline_name'] = self.name + step_args['input_node'] = [] elif step_type == 'extract-s3': step_class = ExtractS3Step From c4afe277f230f305617f395491d871f7ea9777f8 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 23:18:51 -0800 Subject: [PATCH 011/175] organize bin file --- bin/dataduct | 64 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 25 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index c2528e8..a62d99f 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -6,16 +6,6 @@ Script that helps create and validate pipelines from command line import argparse -from dataduct.etl import activate_pipeline -from dataduct.etl import create_pipeline -from dataduct.etl import read_pipeline_definition -from dataduct.etl import validate_pipeline -from dataduct.etl import visualize_pipeline - -from 
dataduct.config import sync_to_s3 -from dataduct.config import sync_from_s3 - - CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' @@ -23,6 +13,40 @@ VISUALIZE_STR = 'visualize' SYNC_CONFIG_TO_S3 = 'sync_config_to_s3' SYNC_CONFIG_FROM_S3 = 'sync_config_from_s3' + +def credential_actions(action, filename): + """Config related actions are executed in this block + """ + from dataduct.config import sync_to_s3 + from dataduct.config import sync_from_s3 + + if action == SYNC_CONFIG_TO_S3: + return sync_to_s3() + + if action == SYNC_CONFIG_FROM_S3: + return sync_from_s3(filename) + + +def pipeline_actions(action, load_definitions, force_overwrite, filename): + """Pipeline related actions are executed in this block + """ + from dataduct.etl import activate_pipeline + from dataduct.etl import create_pipeline + from dataduct.etl import read_pipeline_definition + from dataduct.etl import validate_pipeline + from dataduct.etl import visualize_pipeline + + for load_definition in load_definitions: + definition = read_pipeline_definition(load_definition) + etl = create_pipeline(definition) + if action in [VISUALIZE_STR]: + visualize_pipeline(etl, filename) + if action in [VALIDATE_STR, ACTIVATE_STR]: + validate_pipeline(etl, force_overwrite) + if action == ACTIVATE_STR: + activate_pipeline(etl) + + def main(): """Main function""" parser = argparse.ArgumentParser(description='Run Dataduct commands') @@ -62,21 +86,11 @@ def main(): ) args = parser.parse_args() - if args.action == SYNC_CONFIG_TO_S3: - return sync_to_s3() - - if args.action == SYNC_CONFIG_FROM_S3: - return sync_from_s3(args.filename) - - for load_definition in args.load_definitions: - definition = read_pipeline_definition(load_definition) - etl = create_pipeline(definition) - if args.action in [VISUALIZE_STR]: - visualize_pipeline(etl, args.filename) - if args.action in [VALIDATE_STR, ACTIVATE_STR]: - validate_pipeline(etl, args.force_overwrite) - if args.action == ACTIVATE_STR: - activate_pipeline(etl) + if args.action in [SYNC_CONFIG_TO_S3, SYNC_CONFIG_FROM_S3]: + credential_actions(args.action, args.filename) + else: + pipeline_actions(args.action, args.load_definitions, + args.force_overwrite, args.filename) if __name__ == '__main__': From 79bcc28dcdc072565ec6836f43c6b732381a161d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 28 Dec 2014 23:25:51 -0800 Subject: [PATCH 012/175] update setup file --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 1584bdb..4e4424b 100644 --- a/setup.py +++ b/setup.py @@ -10,6 +10,8 @@ packages=[ 'dataduct', 'dataduct.config', + 'dataduct.data_access', + 'dataduct.etl', 'dataduct.pipeline', 'dataduct.qa', 'dataduct.s3', @@ -25,7 +27,10 @@ description='DataPipeline for Humans', install_requires=[ 'boto>=2.32', - 'pyyaml' + 'PyYAML', + 'pandas', + 'psycopg2', + 'MySQL-python', ], scripts=['bin/dataduct'], classifiers=[ From a7cb16856c49c94b5321027408ba4b115813ee32 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 29 Dec 2014 15:35:42 -0800 Subject: [PATCH 013/175] relative paths for s3_paths --- dataduct/config/__init__.py | 4 +-- dataduct/config/config.py | 34 ------------------------- dataduct/config/config_actions.py | 37 ++++++++++++++++++++++++++++ dataduct/etl/etl_actions.py | 2 +- dataduct/s3/s3_directory.py | 3 ++- dataduct/s3/s3_file.py | 7 +++--- dataduct/s3/utils.py | 27 ++++++++++++++++++++ examples/scripts/primary_key_test.py | 17 +++++-------- 8 files changed, 78 insertions(+), 53 deletions(-) create 
mode 100644 dataduct/config/config_actions.py diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index c77771b..d19dbc8 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1,3 +1,3 @@ from .config import Config -from .config import sync_to_s3 -from .config import sync_from_s3 +from .config_actions import sync_to_s3 +from .config_actions import sync_from_s3 diff --git a/dataduct/config/config.py b/dataduct/config/config.py index 7d74c69..9c39e26 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -3,11 +3,6 @@ import os import yaml -from ..s3 import S3Path -from ..s3 import S3File - -CONFIG_STR = 'config' - def get_config_files(): """Get the config file for dataduct @@ -41,35 +36,6 @@ def load_yaml(configFiles): raise Exception('Dataduct config file is missing') -def s3_config_path(config): - """S3 uri for the config files - """ - key = [config.etl['S3_BASE_PATH'], CONFIG_STR, 'dataduct.cfg'] - return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) - - -def sync_to_s3(): - """Upload the config file to an S3 location - """ - config = Config() - s3_file = S3File(text=config.raw_config(), s3_path=s3_config_path(config)) - s3_file.upload_to_s3() - - -def sync_from_s3(filename): - """Read the config file from S3 - """ - config = Config() - s3_file = S3File(s3_path=s3_config_path(config)) - text = s3_file.text - - if filename is None: - print text - else: - with open(filename, 'w') as op_file: - op_file.write(text) - - class Config(object): """Config singleton to manage changes config variables across the package """ diff --git a/dataduct/config/config_actions.py b/dataduct/config/config_actions.py new file mode 100644 index 0000000..55e47c4 --- /dev/null +++ b/dataduct/config/config_actions.py @@ -0,0 +1,37 @@ +""" +Script that has action functions for config +""" +from .config import Config + +from ..s3 import S3Path +from ..s3 import S3File + +config = Config() +CONFIG_STR = 'config' + + +def s3_config_path(): + """S3 uri for the config files + """ + key = [config.etl['S3_BASE_PATH'], CONFIG_STR, 'dataduct.cfg'] + return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) + + +def sync_to_s3(): + """Upload the config file to an S3 location + """ + s3_file = S3File(text=config.raw_config(), s3_path=s3_config_path()) + s3_file.upload_to_s3() + + +def sync_from_s3(filename): + """Read the config file from S3 + """ + s3_file = S3File(s3_path=s3_config_path()) + text = s3_file.text + + if filename is None: + print text + else: + with open(filename, 'w') as op_file: + op_file.write(text) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 4f7f0b9..5b5568d 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -1,5 +1,5 @@ """ -Script that parses the pipeline definition from the yaml schema +Script that parses the pipeline definition and has action functions """ import yaml diff --git a/dataduct/s3/s3_directory.py b/dataduct/s3/s3_directory.py index 10eee88..2cb4c6d 100644 --- a/dataduct/s3/s3_directory.py +++ b/dataduct/s3/s3_directory.py @@ -2,6 +2,7 @@ Base class for storing a S3 File """ from .s3_path import S3Path +from .utils import parse_path from .utils import upload_dir_to_s3 @@ -21,7 +22,7 @@ def __init__(self, path=None, s3_path=None): s3_path (S3Path, optional): s3_path of the file """ - self.path = path + self.path = parse_path(path) self._s3_path = s3_path @property diff --git a/dataduct/s3/s3_file.py b/dataduct/s3/s3_file.py index 0530b7e..aed0fbb 100644 --- 
a/dataduct/s3/s3_file.py +++ b/dataduct/s3/s3_file.py @@ -4,9 +4,9 @@ from .s3_path import S3Path from .utils import upload_to_s3 from .utils import read_from_s3 +from .utils import parse_path from ..utils.exceptions import ETLInputError - DEFAULT_FILE_NAME = 'file' @@ -32,7 +32,7 @@ def __init__(self, path=None, text=None, s3_path=None): 'Cannot specify both path and text for s3 file.' # Initialize all the values - self._path = path + self._path = parse_path(path) self._text = text self._s3_path = s3_path @@ -73,7 +73,7 @@ def file_name(self): file_name(str): The file_name of this file """ if self._path: - return self._path.split("/").pop() + return self._path.split('/').pop() else: return DEFAULT_FILE_NAME @@ -83,7 +83,6 @@ def s3_path(self): """ return self._s3_path - @s3_path.setter def s3_path(self, s3_path): """Set the S3 path for the file diff --git a/dataduct/s3/utils.py b/dataduct/s3/utils.py index 712cffc..2f07511 100644 --- a/dataduct/s3/utils.py +++ b/dataduct/s3/utils.py @@ -5,8 +5,10 @@ import os from .s3_path import S3Path +from ..config import Config from ..utils.exceptions import ETLInputError +RESOURCE_BASE_PATH = 'RESOURCE_BASE_PATH' def get_s3_bucket(bucket_name): """Returns an S3 bucket object from boto @@ -158,3 +160,28 @@ def delete_dir_from_s3(s3_path): keys = bucket.get_all_keys(prefix=s3_path.key) for key in keys: key.delete() + + +def parse_path(path): + """Change the resource paths for files and directory based on params + + If the path is None, the function returns None. + Else if the path is an absolute path then return the path as is. + Else if the path is a relative path and resource_base_path is declared then + assume the path is relative to the resource_base_path + Else return the path as is. + + Args: + path(str): path specified in the YAML file + """ + # If path is None or absolute + if path is None or os.path.isabs(path): + return path + + # Try relative path to specified config + config = Config() + if RESOURCE_BASE_PATH in config.etl: + return os.path.join(config.etl[RESOURCE_BASE_PATH], path) + + # Return the path as is. 
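A short sketch of the resolution rules `parse_path` implements; it assumes a dataduct config file is importable, and the `RESOURCE_BASE_PATH` value and file names are made-up examples rather than values taken from the patch:

    from dataduct.s3.utils import parse_path

    # None and absolute paths pass through unchanged
    assert parse_path(None) is None
    assert parse_path('/tmp/script.py') == '/tmp/script.py'

    # With a config fragment such as
    #   etl:
    #     RESOURCE_BASE_PATH: /home/etl/resources
    # a relative path is joined onto the base path:
    parse_path('scripts/primary_key_test.py')
    # -> '/home/etl/resources/scripts/primary_key_test.py'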
+ return path diff --git a/examples/scripts/primary_key_test.py b/examples/scripts/primary_key_test.py index a79a80e..8d38f22 100644 --- a/examples/scripts/primary_key_test.py +++ b/examples/scripts/primary_key_test.py @@ -1,14 +1,11 @@ +"""Script that checks for primary key violations on the input table +""" #!/usr/bin/env python - import argparse import pandas.io.sql as pdsql from dataduct.qa import PrimaryKeyCheck -# from datapipeline.database.table import Table -# from datapipeline.qa.check import Check -# from datapipeline.qa.check import get_sns_alert_fn -# from datapipeline.qa.s3 import qa_check_export_fn -# from datapipeline.data_access.connections import redshift_connection +from dataduct.data_access.connection import redshift_connection def query_redshift(production, query): @@ -20,7 +17,7 @@ def query_redshift(production, query): - the value returned by the query """ print "Running query", query - return pdsql.read_sql(query, redshift_connection(production)) + return pdsql.read_sql(query, redshift_connection()) if __name__ == '__main__': @@ -39,10 +36,8 @@ def query_redshift(production, query): print "Got args for check primary key", args table = Table(script=args.table) - result = query_redshift( - args.production, - table.select_duplicates_sql().raw_sql(), - ) + result = pdsql.read_sql( + table.select_duplicates_sql().raw_sql(), redshift_connection()) check = PrimaryKeyCheck( len(result), args.test_name, get_sns_alert_fn(args.sns_topic)) From 3adf87af9121a6ae8aca6118e554167eddaffc77 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 29 Dec 2014 23:47:06 -0800 Subject: [PATCH 014/175] production flag --- bin/dataduct | 38 ++++++++++++++++++++++++++---------- dataduct/config/__init__.py | 2 -- dataduct/config/config.py | 34 ++++++++++++++++++++++++++++---- dataduct/etl/etl_pipeline.py | 4 +++- 4 files changed, 61 insertions(+), 17 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index a62d99f..988ee82 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -6,6 +6,8 @@ Script that helps create and validate pipelines from command line import argparse +from dataduct.config import Config + CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' @@ -14,11 +16,11 @@ SYNC_CONFIG_TO_S3 = 'sync_config_to_s3' SYNC_CONFIG_FROM_S3 = 'sync_config_from_s3' -def credential_actions(action, filename): +def config_actions(action, filename): """Config related actions are executed in this block """ - from dataduct.config import sync_to_s3 - from dataduct.config import sync_from_s3 + from dataduct.config.config_actions import sync_to_s3 + from dataduct.config.config_actions import sync_from_s3 if action == SYNC_CONFIG_TO_S3: return sync_to_s3() @@ -67,27 +69,43 @@ def main(): parser.add_argument( 'load_definitions', nargs='*', - help='Enter the paths of the load definitions.', + help='Enter the paths of the load definitions', ) parser.add_argument( '-f', '--force_overwrite', action='store_true', default=False, - help='Indicates that if this pipeline exists, it will be destroyed' - ' first.', + help='Indicates that if this pipeline exists, it will be destroyed', + ) + parser.add_argument( + '-m', + '--mode', + default=None, + help='Mode to run the pipeline and config overrides to use', ) parser.add_argument( - '-filename', + '-F', '--filename', default=None, - help='Indicates that if this pipeline exists, it will be destroyed' - ' first.', + help='Filename for various actions', ) args = parser.parse_args() + mode = args.mode + if mode is not None: + # We assume mode:dev = 
mode:None + if mode == 'dev': + mode = None + + # To instantiate the singleton object with the correct state + # As this is the single entry point to the library + # We can use the __new__ function to set the debug_level + config = Config(mode=mode) + print 'Running the pipeline in %s mode.' %config.mode + if args.action in [SYNC_CONFIG_TO_S3, SYNC_CONFIG_FROM_S3]: - credential_actions(args.action, args.filename) + config_actions(args.action, args.filename) else: pipeline_actions(args.action, args.load_definitions, args.force_overwrite, args.filename) diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index d19dbc8..cca5d9b 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1,3 +1 @@ from .config import Config -from .config_actions import sync_to_s3 -from .config_actions import sync_from_s3 diff --git a/dataduct/config/config.py b/dataduct/config/config.py index 9c39e26..c5eebc3 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -39,19 +39,45 @@ def load_yaml(configFiles): class Config(object): """Config singleton to manage changes config variables across the package """ - _shared_config = load_yaml(get_config_files()) + _root_config = load_yaml(get_config_files()) + _isInstantiated = False + _root_mode = None - def __init__(self): + def __new__(cls, mode=None): + """Runs once during class instantiation from the cli file + """ + if not cls._isInstantiated: + if mode is not None: + if mode not in cls._root_config: + raise ValueError('Specified mode not found in config') + + # Override the select fields specified based on mode + for key in cls._root_config[mode]: + cls._root_config[key].update(cls._root_config[mode][key]) + + cls._isInstantiated = True + cls._root_mode = mode + + obj = super(Config, cls).__new__(cls) + return obj + + def __init__(self, mode=None): """Constructor for the config class """ - self.__dict__ = self._shared_config + self.__dict__ = self._root_config def __str__(self): """String output for the config object """ - return yaml.dump(self._shared_config, default_flow_style=False) + return yaml.dump(self._root_config, default_flow_style=False) def raw_config(self): """String formatted config file """ return self.__str__() + + @property + def mode(self): + """Mode which the config was created in + """ + return self._root_mode diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index f538ad3..731dc30 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -596,7 +596,9 @@ def create_bootstrap_steps(self, resource_type): step_params = self.bootstrap_definitions selected_steps = list() for step in step_params: - step['name'] += '_' + resource_type # Append type for unique names + if 'name' in step: + # Append type for unique names + step['name'] += '_' + resource_type # If resource type is specified and doesn't match we skip if 'resource_type' in step: From e8d29bad71fd2189c02c667f6f284156df9f2ed9 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 30 Dec 2014 00:57:12 -0800 Subject: [PATCH 015/175] safe config failures --- .gitignore | 3 +++ dataduct/config/config_actions.py | 3 ++- dataduct/data_access/connection.py | 10 +++++++++ dataduct/etl/etl_pipeline.py | 17 +++++++++------ dataduct/pipeline/copy_activity.py | 4 ++-- dataduct/pipeline/default_object.py | 4 ++-- dataduct/pipeline/ec2_resource.py | 16 ++++++++------ dataduct/pipeline/emr_activity.py | 2 +- dataduct/pipeline/emr_resource.py | 24 +++++++++++---------- 
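To make the new mode handling concrete, here is a sketch of how `Config(mode=...)` appears intended to be used from the CLI entry point; the `production` section is a hypothetical config fragment, not one shipped with this patch:

    from dataduct.config import Config

    # Hypothetical dataduct config with a mode section named 'production':
    #
    #   etl:
    #     S3_ETL_BUCKET: dev-bucket
    #   production:
    #     etl:
    #       S3_ETL_BUCKET: prod-bucket
    #
    # The first instantiation (done once in bin/dataduct) merges the nested
    # keys of the mode section over the matching top-level sections.
    config = Config(mode='production')
    print(config.etl['S3_ETL_BUCKET'])    # -> prod-bucket
    print(config.mode)                    # -> production

    # Later instantiations share the singleton state and see the overrides.
    print(Config().etl['S3_ETL_BUCKET'])  # -> prod-bucket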
dataduct/pipeline/redshift_copy_activity.py | 4 ++-- dataduct/pipeline/redshift_database.py | 5 +++++ dataduct/pipeline/s3_node.py | 2 +- dataduct/pipeline/schedule.py | 2 +- dataduct/pipeline/shell_command_activity.py | 4 ++-- dataduct/pipeline/sns_alarm.py | 4 ++-- dataduct/pipeline/sql_activity.py | 4 ++-- dataduct/s3/s3_path.py | 2 +- dataduct/steps/etl_step.py | 2 +- dataduct/utils/exceptions.py | 22 ++++++++++++++++++- 19 files changed, 91 insertions(+), 43 deletions(-) diff --git a/.gitignore b/.gitignore index d748e3b..3256ab0 100644 --- a/.gitignore +++ b/.gitignore @@ -13,3 +13,6 @@ # Python egg metadata, regenerated from source files by setuptools. /*.egg-info /*.egg + +# Images created should be checked in manually +*.png diff --git a/dataduct/config/config_actions.py b/dataduct/config/config_actions.py index 55e47c4..b3df162 100644 --- a/dataduct/config/config_actions.py +++ b/dataduct/config/config_actions.py @@ -8,12 +8,13 @@ config = Config() CONFIG_STR = 'config' +DATADUCT_FILE_NAME = 'dataduct.cfg' def s3_config_path(): """S3 uri for the config files """ - key = [config.etl['S3_BASE_PATH'], CONFIG_STR, 'dataduct.cfg'] + key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, DATADUCT_FILE_NAME] return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py index 839f874..39cc36b 100644 --- a/dataduct/data_access/connection.py +++ b/dataduct/data_access/connection.py @@ -6,6 +6,7 @@ from ..config import Config from ..utils.helpers import retry +from ..utils.exceptions import ETLConfigError config = Config() @@ -13,6 +14,9 @@ def redshift_connection(**kwargs): """Fetch a psql connection object to redshift """ + if not hasattr(config, 'redshift'): + raise ETLConfigError('Redshift not found in dataduct configs') + connection = psycopg2.connect( host=config.redshift['HOST'], user=config.redshift['USERNAME'], @@ -30,6 +34,12 @@ def rds_connection(host_name, cursorclass=MySQLdb.cursors.SSCursor, **kwargs): """Fetch a psql connection object to redshift """ + if not hasattr(config, 'mysql'): + raise ETLConfigError('mysql not found in dataduct configs') + + if host_name not in config.mysql: + raise ETLConfigError('Config for hostname: %s not found' %host_name) + sql_creds = config.mysql[host_name] connection = MySQLdb.connect( diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 731dc30..86167c1 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -33,11 +33,10 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] -S3_BASE_PATH = config.etl['S3_BASE_PATH'] -SNS_TOPIC_ARN_FAILURE = config.etl['SNS_TOPIC_ARN_FAILURE'] -BOOTSTRAP_STEPS_DEFINITION = config.bootstrap +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +S3_BASE_PATH = config.etl.get('S3_BASE_PATH', '') +SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) EC2_RESOURCE_STR = 'ec2' EMR_CLUSTER_STR = 'emr' @@ -57,7 +56,7 @@ def __init__(self, name, frequency='one-time', ec2_resource_terminate_after='6 Hours', delay=None, emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=DEFAULT_MAX_RETRIES, - bootstrap=BOOTSTRAP_STEPS_DEFINITION): + bootstrap=None): """Example of docstring on the __init__ method. 
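The safe-access pattern this commit applies across the pipeline modules can be summarized in a few lines; `Config()` here assumes a config file is present, and the defaults shown mirror the ones used in the diff:

    from dataduct.config import Config
    from dataduct.utils.exceptions import ETLConfigError

    config = Config()

    # Optional settings fall back to a default instead of raising KeyError
    max_retries = config.etl.get('DEFAULT_MAX_RETRIES', 0)
    retry_delay = config.etl.get('RETRY_DELAY', '10 Minutes')

    # Sections that are genuinely required fail fast with a clear config error
    if not hasattr(config, 'redshift'):
        raise ETLConfigError('Redshift credentials missing from config')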
The __init__ method may be documented in either the class level @@ -90,7 +89,13 @@ def __init__(self, name, frequency='one-time', self.load_min = load_min self.max_retries = max_retries self.topic_arn = topic_arn - self.bootstrap_definitions = bootstrap + + if bootstrap is not None: + self.bootstrap_definitions = bootstrap + elif hasattr(config, 'bootstrap'): + self.bootstrap_definitions = config.bootstrap + else: + self.bootstrap_definitions = list() if emr_cluster_config: self.emr_cluster_config = emr_cluster_config diff --git a/dataduct/pipeline/copy_activity.py b/dataduct/pipeline/copy_activity.py index b7cb604..960eda0 100644 --- a/dataduct/pipeline/copy_activity.py +++ b/dataduct/pipeline/copy_activity.py @@ -9,8 +9,8 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') class CopyActivity(Activity): diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index 0e3da7e..4160f13 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -6,8 +6,8 @@ from ..config import Config config = Config() -DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] -DEFAULT_RESOURCE_ROLE = config.ec2['DEFAULT_RESOURCE_ROLE'] +DEFAULT_ROLE = config.etl['DEFAULT_ROLE'] +DEFAULT_RESOURCE_ROLE = config.etl['DEFAULT_RESOURCE_ROLE'] class DefaultObject(PipelineObject): diff --git a/dataduct/pipeline/ec2_resource.py b/dataduct/pipeline/ec2_resource.py index 30f482d..a7a7802 100644 --- a/dataduct/pipeline/ec2_resource.py +++ b/dataduct/pipeline/ec2_resource.py @@ -9,13 +9,15 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_RESOURCE_ROLE = config.ec2['DEFAULT_RESOURCE_ROLE'] -DEFAULT_EC2_INSTANCE_TYPE = config.ec2['DEFAULT_EC2_INSTANCE_TYPE'] -ETL_AMI = config.ec2['ETL_AMI'] -KEY_PAIR = config.ec2['KEY_PAIR'] -DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] -SECURITY_GROUP = config.ec2['SECURITY_GROUP'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +DEFAULT_ROLE = config.etl['DEFAULT_ROLE'] +DEFAULT_RESOURCE_ROLE = config.etl['DEFAULT_RESOURCE_ROLE'] + +DEFAULT_EC2_INSTANCE_TYPE = config.ec2.get( + 'DEFAULT_EC2_INSTANCE_TYPE', 'm1.large') +ETL_AMI = config.ec2.get('ETL_AMI', None) +SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', None) +KEY_PAIR = config.etl.get('KEY_PAIR', None) +RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') class Ec2Resource(PipelineObject): diff --git a/dataduct/pipeline/emr_activity.py b/dataduct/pipeline/emr_activity.py index 79c1257..c1a2e2c 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -8,7 +8,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) class EmrActivity(Activity): diff --git a/dataduct/pipeline/emr_resource.py b/dataduct/pipeline/emr_resource.py index c7e719f..5abb5b6 100644 --- a/dataduct/pipeline/emr_resource.py +++ b/dataduct/pipeline/emr_resource.py @@ -9,17 +9,19 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_NUM_CORE_INSTANCES = config.emr['DEFAULT_NUM_CORE_INSTANCES'] -DEFAULT_CORE_INSTANCE_TYPE = config.emr['DEFAULT_CORE_INSTANCE_TYPE'] -DEFAULT_TASK_INSTANCE_BID_PRICE = config.emr['DEFAULT_TASK_INSTANCE_BID_PRICE'] -DEFAULT_TASK_INSTANCE_TYPE = 
config.emr['DEFAULT_TASK_INSTANCE_TYPE'] -DEFAULT_MASTER_INSTANCE_TYPE = config.emr['DEFAULT_MASTER_INSTANCE_TYPE'] -DEFAULT_CLUSTER_TIMEOUT = config.emr['DEFAULT_CLUSTER_TIMEOUT'] -DEFAULT_HADOOP_VERSION = config.emr['DEFAULT_HADOOP_VERSION'] -DEFAULT_HIVE_VERSION = config.emr['DEFAULT_HIVE_VERSION'] -DEFAULT_PIG_VERSION = config.emr['DEFAULT_PIG_VERSION'] -DEFAULT_CLUSTER_AMI = config.emr['DEFAULT_CLUSTER_AMI'] -KEY_PAIR = config.ec2['KEY_PAIR'] +DEFAULT_NUM_CORE_INSTANCES = config.emr.get('DEFAULT_NUM_CORE_INSTANCES', None) +DEFAULT_CORE_INSTANCE_TYPE = config.emr.get('DEFAULT_CORE_INSTANCE_TYPE', 'm1.large') +DEFAULT_TASK_INSTANCE_BID_PRICE = config.emr.get( + 'DEFAULT_TASK_INSTANCE_BID_PRICE', None) +DEFAULT_TASK_INSTANCE_TYPE = config.emr.get('DEFAULT_TASK_INSTANCE_TYPE', 'm1.large') +DEFAULT_MASTER_INSTANCE_TYPE = config.emr.get( + 'DEFAULT_MASTER_INSTANCE_TYPE', 'm1.large') +DEFAULT_CLUSTER_TIMEOUT = config.emr.get('DEFAULT_CLUSTER_TIMEOUT', '6 Hours') +DEFAULT_HADOOP_VERSION = config.emr.get('DEFAULT_HADOOP_VERSION', None) +DEFAULT_HIVE_VERSION = config.emr.get('DEFAULT_HIVE_VERSION', None) +DEFAULT_PIG_VERSION = config.emr.get('DEFAULT_PIG_VERSION', None) +DEFAULT_CLUSTER_AMI = config.emr.get('DEFAULT_CLUSTER_AMI', '2.4.7') +KEY_PAIR = config.etl.get('KEY_PAIR', None) class EmrResource(PipelineObject): diff --git a/dataduct/pipeline/redshift_copy_activity.py b/dataduct/pipeline/redshift_copy_activity.py index 0c91a57..18cc1ff 100644 --- a/dataduct/pipeline/redshift_copy_activity.py +++ b/dataduct/pipeline/redshift_copy_activity.py @@ -9,8 +9,8 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') class RedshiftCopyActivity(Activity): diff --git a/dataduct/pipeline/redshift_database.py b/dataduct/pipeline/redshift_database.py index 0eacae9..dfd3649 100644 --- a/dataduct/pipeline/redshift_database.py +++ b/dataduct/pipeline/redshift_database.py @@ -4,8 +4,13 @@ from ..config import Config from .pipeline_object import PipelineObject +from ..utils.exceptions import ETLConfigError config = Config() + +if not hasattr(config, 'redshift'): + raise ETLConfigError('Redshift credentials missing from config') + DATABASE_NAME = config.redshift['DATABASE_NAME'] CLUSTER_ID = config.redshift['CLUSTER_ID'] USERNAME = config.redshift['USERNAME'] diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index 7afe2bd..34d1f86 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -13,7 +13,7 @@ from ..utils.exceptions import ETLInputError config = Config() -RETRY_DELAY = config.etl['RETRY_DELAY'] +RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') class S3Node(PipelineObject): diff --git a/dataduct/pipeline/schedule.py b/dataduct/pipeline/schedule.py index ff2f029..af0359a 100644 --- a/dataduct/pipeline/schedule.py +++ b/dataduct/pipeline/schedule.py @@ -9,7 +9,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DAILY_LOAD_TIME = config.etl['DAILY_LOAD_TIME'] +DAILY_LOAD_TIME = config.etl.get('DAILY_LOAD_TIME', 1) FEQUENCY_PERIOD_CONVERTION = { diff --git a/dataduct/pipeline/shell_command_activity.py b/dataduct/pipeline/shell_command_activity.py index d22bdfa..953828d 100644 --- a/dataduct/pipeline/shell_command_activity.py +++ b/dataduct/pipeline/shell_command_activity.py @@ -9,8 +9,8 @@ from 
..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') class ShellCommandActivity(Activity): diff --git a/dataduct/pipeline/sns_alarm.py b/dataduct/pipeline/sns_alarm.py index 19b9639..c822bab 100644 --- a/dataduct/pipeline/sns_alarm.py +++ b/dataduct/pipeline/sns_alarm.py @@ -6,8 +6,8 @@ from .pipeline_object import PipelineObject config = Config() -SNS_TOPIC_ARN_FAILURE = config.etl['SNS_TOPIC_ARN_FAILURE'] -DEFAULT_ROLE = config.ec2['DEFAULT_ROLE'] +SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) +DEFAULT_ROLE = config.etl['DEFAULT_ROLE'] class SNSAlarm(PipelineObject): diff --git a/dataduct/pipeline/sql_activity.py b/dataduct/pipeline/sql_activity.py index 46a5cc6..95407b9 100644 --- a/dataduct/pipeline/sql_activity.py +++ b/dataduct/pipeline/sql_activity.py @@ -10,8 +10,8 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] -RETRY_DELAY = config.etl['RETRY_DELAY'] +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') class SqlActivity(Activity): diff --git a/dataduct/s3/s3_path.py b/dataduct/s3/s3_path.py index 4b4e36f..fd154dd 100644 --- a/dataduct/s3/s3_path.py +++ b/dataduct/s3/s3_path.py @@ -76,7 +76,7 @@ def append(self, new_key, is_directory=False): new_key = join(*new_key) # Remove duplicate, leading, and trailing '/' - new_key = [a for a in new_key.split("/") if a != ""] + new_key = [a for a in new_key.split("/") if a != ''] # AWS prevents us from using periods in paths # Substitute them with '_' diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 8f3be75..895c190 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -11,7 +11,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl['DEFAULT_MAX_RETRIES'] +DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) class ETLStep(object): diff --git a/dataduct/utils/exceptions.py b/dataduct/utils/exceptions.py index 9c59ebd..f3a1560 100644 --- a/dataduct/utils/exceptions.py +++ b/dataduct/utils/exceptions.py @@ -2,7 +2,6 @@ Exceptions for etl_lib """ - class ETLInputError(Exception): """Error raised when function input is incorrect. @@ -16,5 +15,26 @@ class ETLInputError(Exception): """ def __init__(self, msg, code=2): + """Constructor for the exception + """ + self.msg = msg + self.code = code + + +class ETLConfigError(Exception): + """Error raised when function input is incorrect. + + Args: + msg (str): Human readable string describing the exception. + code (int, optional): Error code, defaults to 2. + + Attributes: + msg (str): Human readable string describing the exception. + code (int): Exception error code. 
+ + """ + def __init__(self, msg, code=2): + """Constructor for the exception + """ self.msg = msg self.code = code From 6fc5c04b710693ae911289e55f94fb0cdceb12fb Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 30 Dec 2014 11:56:25 -0800 Subject: [PATCH 016/175] delay + bootstrap --- bin/dataduct | 14 ++++++-- dataduct/etl/etl_pipeline.py | 52 ++++++++++------------------- dataduct/pipeline/schedule.py | 4 ++- examples/example_bootstrap.yaml | 9 ++--- examples/example_load_redshift.yaml | 3 -- 5 files changed, 38 insertions(+), 44 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index 988ee82..ccc0f77 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -29,7 +29,8 @@ def config_actions(action, filename): return sync_from_s3(filename) -def pipeline_actions(action, load_definitions, force_overwrite, filename): +def pipeline_actions(action, load_definitions, force_overwrite, filename, + delay): """Pipeline related actions are executed in this block """ from dataduct.etl import activate_pipeline @@ -40,6 +41,8 @@ def pipeline_actions(action, load_definitions, force_overwrite, filename): for load_definition in load_definitions: definition = read_pipeline_definition(load_definition) + definition.update({'delay': delay}) + etl = create_pipeline(definition) if action in [VISUALIZE_STR]: visualize_pipeline(etl, filename) @@ -84,6 +87,13 @@ def main(): default=None, help='Mode to run the pipeline and config overrides to use', ) + parser.add_argument( + '-d', + '--delay', + default=0, + type=int, + help='Delay the pipeline by x days', + ) parser.add_argument( '-F', '--filename', @@ -108,7 +118,7 @@ def main(): config_actions(args.action, args.filename) else: pipeline_actions(args.action, args.load_definitions, - args.force_overwrite, args.filename) + args.force_overwrite, args.filename, args.delay) if __name__ == '__main__': diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 86167c1..f547bf8 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -2,6 +2,7 @@ Class definition for DataPipeline """ from datetime import datetime +from copy import deepcopy import yaml from ..config import Config @@ -54,27 +55,22 @@ class ETLPipeline(object): """ def __init__(self, name, frequency='one-time', ec2_resource_terminate_after='6 Hours', - delay=None, emr_cluster_config=None, load_time=None, + delay=0, emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=DEFAULT_MAX_RETRIES, bootstrap=None): - """Example of docstring on the __init__ method. - - The __init__ method may be documented in either the class level - docstring, or as a docstring on the __init__ method itself. - - Either form is acceptable, but the two should not be mixed. Choose one - convention to document the __init__ method and be consistent with it. - - Note: - Do not include the `self` parameter in the ``Args`` section. + """Constructor for the pipeline class Args: name (str): Name of the pipeline should be globally unique. frequency (enum): Frequency of the pipeline. Can be - attr2 (list of str): Description of `attr2`. - attr3 (int): Description of `attr3`. 
- + ec2_resource_terminate_after (str): Timeout for ec2 resource + delay(int): Number of days to delay the pipeline by + emr_cluster_config(dict): Dictionary for emr config + topic_arn(str): sns alert to be used by the pipeline + max_retries(int): number of retries for pipeline activities + bootstrap(list of steps): bootstrap step definitions for resources """ + if load_time: load_hour, load_min = [int(x) for x in load_time.split(':')] else: @@ -84,9 +80,9 @@ def __init__(self, name, frequency='one-time', self._name = name self.frequency = frequency self.ec2_resource_terminate_after = ec2_resource_terminate_after - self.delay = delay self.load_hour = load_hour self.load_min = load_min + self.delay = delay self.max_retries = max_retries self.topic_arn = topic_arn @@ -95,7 +91,7 @@ def __init__(self, name, frequency='one-time', elif hasattr(config, 'bootstrap'): self.bootstrap_definitions = config.bootstrap else: - self.bootstrap_definitions = list() + self.bootstrap_definitions = dict() if emr_cluster_config: self.emr_cluster_config = emr_cluster_config @@ -352,7 +348,7 @@ def determine_step_class(self, step_type, step_args): """ if step_type == 'transform': step_class = TransformStep - if step_args.get('resource', None) == EMR_CLUSTER_STR: + if step_args.pop('resource_type', None) == EMR_CLUSTER_STR: step_args['resource'] = self.emr_cluster elif step_type == 'qa-transform': @@ -598,24 +594,12 @@ def create_bootstrap_steps(self, resource_type): resource_type(enum of str): type of resource we're bootstraping can be ec2 / emr """ - step_params = self.bootstrap_definitions - selected_steps = list() - for step in step_params: - if 'name' in step: - # Append type for unique names - step['name'] += '_' + resource_type - - # If resource type is specified and doesn't match we skip - if 'resource_type' in step: - if step['resource_type'] != resource_type: - continue - else: - step.pop('resource_type') - - step['resource'] = self.allocate_resource(resource_type) - selected_steps.append(step) + step_params = self.bootstrap_definitions.get(resource_type, list()) + for step_param in step_params: + # Mutating the steps here by adding resource + step_param['resource'] = self.allocate_resource(resource_type) - steps = self.create_steps(selected_steps, True) + steps = self.create_steps(step_params, True) self._bootstrap_steps.extend(steps) return steps diff --git a/dataduct/pipeline/schedule.py b/dataduct/pipeline/schedule.py index af0359a..3da33bf 100644 --- a/dataduct/pipeline/schedule.py +++ b/dataduct/pipeline/schedule.py @@ -52,8 +52,10 @@ def __init__(self, if delay is None: delay = timedelta(0) + elif isinstance(delay, int): + delay = timedelta(days=delay) elif not isinstance(delay, timedelta): - raise ETLInputError('Delay must be an instance of timedelta') + raise ETLInputError('Delay must be an instance of timedelta or int') if frequency in FEQUENCY_PERIOD_CONVERTION: period, occurrences = FEQUENCY_PERIOD_CONVERTION[frequency] diff --git a/examples/example_bootstrap.yaml b/examples/example_bootstrap.yaml index b16abb6..3890e8c 100644 --- a/examples/example_bootstrap.yaml +++ b/examples/example_bootstrap.yaml @@ -5,10 +5,11 @@ load_time: 01:00 # Hour:Min in UTC description : Example for the transform step bootstrap: -- step_type: transform - input_node: [] - command: pip install git+https://github.com/coursera/dataduct.git >> ${OUTPUT1_STAGING_DIR}/output.txt - name: bootstrap + ec2: + - step_type: transform + input_node: [] + command: pip install git+https://github.com/coursera/dataduct.git >> 
${OUTPUT1_STAGING_DIR}/output.txt + name: bootstrap_override steps: - step_type: transform diff --git a/examples/example_load_redshift.yaml b/examples/example_load_redshift.yaml index 5d7c2cd..735a386 100644 --- a/examples/example_load_redshift.yaml +++ b/examples/example_load_redshift.yaml @@ -11,6 +11,3 @@ steps: - step_type: load-redshift schema: dev table: test_table - -- step_type: qa-transform - script: examples/scripts/primary_key_test.py From eada1585846f6b4d1ea47a0753f597ac344db5e1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 30 Dec 2014 20:00:57 -0800 Subject: [PATCH 017/175] step parsing - I --- dataduct/etl/etl_pipeline.py | 23 ++++------------------- dataduct/steps/emr_streaming.py | 11 +++++++++++ dataduct/steps/etl_step.py | 7 +++++++ dataduct/steps/extract_local.py | 16 ++++++++++++++++ dataduct/steps/extract_rds.py | 11 +++++++++++ dataduct/steps/extract_redshift.py | 12 ++++++++++++ dataduct/steps/extract_s3.py | 11 +++++++++++ dataduct/steps/load_redshift.py | 11 +++++++++++ dataduct/steps/qa_transform.py | 12 ++++++++++++ dataduct/steps/sql_command.py | 12 ++++++++++++ dataduct/steps/transform.py | 17 +++++++++++++++-- dataduct/utils/constants.py | 4 ++++ 12 files changed, 126 insertions(+), 21 deletions(-) create mode 100644 dataduct/utils/constants.py diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index f547bf8..867b758 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -32,6 +32,7 @@ from ..s3.s3_log_path import S3LogPath from ..utils.exceptions import ETLInputError +from ..utils import constants as const config = Config() S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] @@ -40,7 +41,6 @@ SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) EC2_RESOURCE_STR = 'ec2' -EMR_CLUSTER_STR = 'emr' LOG_STR = 'logs' DATA_STR = 'data' SRC_STR = 'src' @@ -304,7 +304,7 @@ def emr_cluster(self): **self.emr_cluster_config ) - self.create_bootstrap_steps(EMR_CLUSTER_STR) + self.create_bootstrap_steps(const.EMR_CLUSTER_STR) return self._emr_cluster @property @@ -348,50 +348,35 @@ def determine_step_class(self, step_type, step_args): """ if step_type == 'transform': step_class = TransformStep - if step_args.pop('resource_type', None) == EMR_CLUSTER_STR: - step_args['resource'] = self.emr_cluster elif step_type == 'qa-transform': step_class = QATransformStep - step_args['pipeline_name'] = self.name - step_args['input_node'] = [] elif step_type == 'extract-s3': step_class = ExtractS3Step - step_args.pop('resource') elif step_type == 'extract-local': step_class = ExtractLocalStep - step_args.pop('resource') - if self.frequency != 'one-time': - raise ETLInputError( - 'Extract Local can be used for one-time pipelines only') elif step_type == 'extract-rds': step_class = ExtractRdsStep - step_args.pop('input_node', None) elif step_type == 'extract-redshift': step_class = ExtractRedshiftStep - step_args['redshift_database'] = self.redshift_database - step_args.pop('input_node', None) elif step_type == 'sql-command': step_class = SqlCommandStep - step_args['redshift_database'] = self.redshift_database - step_args.pop('input_node', None) elif step_type == 'emr-streaming': step_class = EMRStreamingStep - step_args['resource'] = self.emr_cluster elif step_type == 'load-redshift': step_class = LoadRedshiftStep - step_args['redshift_database'] = self.redshift_database else: raise ETLInputError('Step type %s not recogonized' % step_type) + step_args = step_class.argument_parser(self, step_args) return step_class, step_args 
def translate_input_nodes(self, input_node): @@ -580,7 +565,7 @@ def create_steps(self, steps_params, is_bootstrap=False): def allocate_resource(self, resource_type): """Allocate the resource object based on the resource type specified """ - if resource_type == EMR_CLUSTER_STR: + if resource_type == const.EMR_CLUSTER_STR: return self.emr_cluster elif resource_type == EC2_RESOURCE_STR: return self.ec2_resource diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 466d16c..9f363db 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -168,3 +168,14 @@ def merge_s3_nodes(self, input_nodes): depends_on = [] output_node = input_nodes.values() return output_node, depends_on + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args['resource'] = etl.emr_cluster + return step_args diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 895c190..ac54276 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -341,3 +341,10 @@ def activities(self): result: All aws activites that are created for this step """ return [x for x in self._objects.values() if isinstance(x, Activity)] + + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + """ + return step_args diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index 7edfa5a..0cb82c6 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -3,6 +3,7 @@ """ from .etl_step import ETLStep from ..s3 import S3File +from ..utils.exceptions import ETLInputError class ExtractLocalStep(ETLStep): @@ -18,3 +19,18 @@ def __init__(self, path, **kwargs): """ super(ExtractLocalStep, self).__init__(**kwargs) self._output = self.create_s3_data_node(s3_object=S3File(path=path)) + + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args.pop('resource') + if etl.frequency != 'one-time': + raise ETLInputError( + 'Extract Local can be used for one-time pipelines only') + return step_args diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index 8ed5207..6190f65 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -116,3 +116,14 @@ def __init__(self, resource=self.resource, schedule=self.schedule, ) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args.pop('input_node', None) + return step_args diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index 72883c2..6eb5205 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -53,3 +53,15 @@ def __init__(self, depends_on=self.depends_on, command_options=["DELIMITER '\t' ESCAPE"], ) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object 
containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args['redshift_database'] = etl.redshift_database + step_args.pop('input_node', None) + return step_args diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index d3bebcc..43b1b9e 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -18,3 +18,14 @@ def __init__(self, uri, **kwargs): """ super(ExtractS3Step, self).__init__(**kwargs) self._output = self.create_s3_data_node(s3_object=S3Path(uri=uri)) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args.pop('resource') + return step_args diff --git a/dataduct/steps/load_redshift.py b/dataduct/steps/load_redshift.py index d4d605e..c63ef32 100644 --- a/dataduct/steps/load_redshift.py +++ b/dataduct/steps/load_redshift.py @@ -63,3 +63,14 @@ def __init__(self, depends_on=self.depends_on, command_options=command_options, ) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args['redshift_database'] = etl.redshift_database + return step_args diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index 809ed82..f8c88fe 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -40,3 +40,15 @@ def __init__(self, id=id, script_arguments=script_arguments, **kwargs) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args['pipeline_name'] = etl.name + step_args.pop('input_node', None) + return step_args diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 7a0e470..6984582 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -55,3 +55,15 @@ def __init__(self, script=script, queue=queue, ) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args['redshift_database'] = etl.redshift_database + step_args.pop('input_node', None) + return step_args diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index e9febba..9b9d462 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -6,11 +6,11 @@ from ..s3 import S3File from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError +from ..utils import constants as const SCRIPT_ARGUMENT_TYPE_STRING = 'string' SCRIPT_ARGUMENT_TYPE_SQL = 'sql' - class TransformStep(ETLStep): """Transform Step class that helps run scripts on resouces """ @@ -114,7 +114,20 @@ def translate_arguments(self, script_arguments): else: raise ETLInputError('Script Arguments for unrecognized type') - def input_format(self, key, value): + @staticmethod + def input_format(key, value): """Format the key and value to command line arguments """ return 
''.join('--', key, '=', value) + + @staticmethod + def argument_parser(etl, step_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + if step_args.pop('resource_type', None) == const.EMR_CLUSTER_STR: + step_args['resource'] = etl.emr_cluster + return step_args diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py new file mode 100644 index 0000000..261121c --- /dev/null +++ b/dataduct/utils/constants.py @@ -0,0 +1,4 @@ +"""Constants shared across dataduct +""" + +EMR_CLUSTER_STR = 'emr' From e9fa9ecf865e6f9cc5cf971ff17b866ca54c1dc2 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 30 Dec 2014 23:17:08 -0800 Subject: [PATCH 018/175] step parsing - II --- dataduct/etl/etl_pipeline.py | 167 ++++++++--------------------- dataduct/steps/emr_streaming.py | 6 +- dataduct/steps/etl_step.py | 83 +++++++++++++- dataduct/steps/extract_local.py | 8 +- dataduct/steps/extract_rds.py | 9 +- dataduct/steps/extract_redshift.py | 9 +- dataduct/steps/extract_s3.py | 7 +- dataduct/steps/load_redshift.py | 7 +- dataduct/steps/qa_transform.py | 9 +- dataduct/steps/sql_command.py | 9 +- dataduct/steps/transform.py | 8 +- 11 files changed, 177 insertions(+), 145 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 867b758..d3062c9 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -2,7 +2,6 @@ Class definition for DataPipeline """ from datetime import datetime -from copy import deepcopy import yaml from ..config import Config @@ -27,9 +26,9 @@ from ..steps import TransformStep from ..steps import QATransformStep -from ..s3.s3_file import S3File -from ..s3.s3_path import S3Path -from ..s3.s3_log_path import S3LogPath +from ..s3 import S3File +from ..s3 import S3Path +from ..s3 import S3LogPath from ..utils.exceptions import ETLInputError from ..utils import constants as const @@ -106,7 +105,7 @@ def __init__(self, name, frequency='one-time', self.errors = None self._base_objects = dict() - self._intermediate_nodes = dict() + self.intermediate_nodes = dict() self._steps = dict() self._bootstrap_steps = list() @@ -197,6 +196,15 @@ def name(self): """ return self._name + @property + def steps(self): + """Get the steps of the pipeline + + Returns: + result: steps of the pipeline + """ + return self._steps + def _s3_uri(self, data_type): """Get the S3 location for various data associated with the pipeline @@ -335,50 +343,6 @@ def step(self, step_id): """ return self._steps.get(step_id, None) - def determine_step_class(self, step_type, step_args): - """Determine step class from input to correct ETL step types - - Args: - step_type(str): string specifing step_type of the objects - step_args(dict): dictionary of step arguments - - Returns: - step_class(ETLStep): Class object for the specific step_type - step_args(dict): dictionary of step arguments - """ - if step_type == 'transform': - step_class = TransformStep - - elif step_type == 'qa-transform': - step_class = QATransformStep - - elif step_type == 'extract-s3': - step_class = ExtractS3Step - - elif step_type == 'extract-local': - step_class = ExtractLocalStep - - elif step_type == 'extract-rds': - step_class = ExtractRdsStep - - elif step_type == 'extract-redshift': - step_class = ExtractRedshiftStep - - elif step_type == 'sql-command': - step_class = SqlCommandStep - - elif step_type == 'emr-streaming': - 
step_class = EMRStreamingStep - - elif step_type == 'load-redshift': - step_class = LoadRedshiftStep - - else: - raise ETLInputError('Step type %s not recogonized' % step_type) - - step_args = step_class.argument_parser(self, step_args) - return step_class, step_args - def translate_input_nodes(self, input_node): """Translate names from YAML to input_nodes @@ -411,9 +375,9 @@ def translate_input_nodes(self, input_node): """ output = dict() for key, value in input_node.iteritems(): - if key not in self._intermediate_nodes: + if key not in self.intermediate_nodes: raise ETLInputError('Input reference does not exist') - output[value] = self._intermediate_nodes[key] + output[value] = self.intermediate_nodes[key] return output def parse_step_args(self, step_type, **kwargs): @@ -431,74 +395,37 @@ def parse_step_args(self, step_type, **kwargs): if not isinstance(step_type, str): raise ETLInputError('Step type must be a string') - # Base dictionary for every step - step_args = { - 'resource': None, - 'schedule': self.schedule, - 'max_retries': self.max_retries, - 'required_steps': list() - } - step_args.update(kwargs) - - # Description is optional and should not be passed - step_args.pop('description', None) - - # Add dependencies - depends_on = step_args.pop('depends_on', None) - if depends_on: - for step_id in list(depends_on): - if step_id not in self._steps: - raise ETLInputError('Step depends on non-existent step') - step_args['required_steps'].append(self._steps[step_id]) - - step_class, step_args = self.determine_step_class(step_type, step_args) - - # Set input node and required_steps - input_node = step_args.get('input_node', None) - if input_node: - if isinstance(input_node, dict): - input_node = self.translate_input_nodes(input_node) - elif isinstance(input_node, str): - input_node = self._intermediate_nodes[input_node] - step_args['input_node'] = input_node - - # Add dependencies from steps that create input nodes - if isinstance(input_node, dict): - required_nodes = input_node.values() - else: - required_nodes = [input_node] - - for required_node in required_nodes: - for step in self._steps.values(): - if step not in step_args['required_steps'] and \ - required_node in step.pipeline_objects: - step_args['required_steps'].append(step) - - # Set resource for the step if not specified or removed - if 'resource' in step_args and step_args['resource'] is None: - step_args['resource'] = self.ec2_resource - - # Set the name if name not provided - if 'name' in step_args: - name = step_args.pop('name') + if step_type == 'transform': + step_class = TransformStep + + elif step_type == 'qa-transform': + step_class = QATransformStep + + elif step_type == 'extract-s3': + step_class = ExtractS3Step + + elif step_type == 'extract-local': + step_class = ExtractLocalStep + + elif step_type == 'extract-rds': + step_class = ExtractRdsStep + + elif step_type == 'extract-redshift': + step_class = ExtractRedshiftStep + + elif step_type == 'sql-command': + step_class = SqlCommandStep + + elif step_type == 'emr-streaming': + step_class = EMRStreamingStep + + elif step_type == 'load-redshift': + step_class = LoadRedshiftStep + else: - # If the name of the step is not provided, one is assigned as: - # [step_class][index] - name = step_class.__name__ + str(sum( - [1 for a in self._steps.values() if isinstance(a, step_class)] - )) - - # Each step is given it's own directory so that there is no clashing - # of file names. 
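The argument handling removed here is re-homed in ETLStep.base_arguments_processor later in this patch; a rough sketch of the resulting flow, assuming an already-constructed ETLPipeline instance named etl and a hypothetical script path:

    # Sketch only, not part of the patch: parse_step_args now just resolves the
    # step class and hands argument handling to that class's arguments_processor,
    # which in turn calls the shared ETLStep.base_arguments_processor.
    step_class, step_args = etl.parse_step_args('transform',
                                                script='scripts/example_script.py')
    step = step_class(**step_args)
    etl.add_step(step)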
- step_args.update({ - 'id': name, - 's3_log_dir': S3LogPath(name, parent_dir=self.s3_log_dir, - is_directory=True), - 's3_data_dir': S3Path(name, parent_dir=self.s3_data_dir, - is_directory=True), - 's3_source_dir': S3Path(name, parent_dir=self.s3_source_dir, - is_directory=True), - }) + raise ETLInputError('Step type %s not recogonized' % step_type) + + step_args = step_class.arguments_processor(self, kwargs) return step_class, step_args @@ -518,9 +445,9 @@ def add_step(self, step, is_bootstrap=False): # Update intermediate_nodes dict if isinstance(step.output, dict): - self._intermediate_nodes.update(step.output) + self.intermediate_nodes.update(step.output) elif step.output and step.id: - self._intermediate_nodes[step.id] = step.output + self.intermediate_nodes[step.id] = step.output def create_steps(self, steps_params, is_bootstrap=False): """Create pipeline steps and add appropriate dependencies diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 9f363db..73f5f1f 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -169,13 +169,15 @@ def merge_s3_nodes(self, input_nodes): output_node = input_nodes.values() return output_node, depends_on - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + step_args = cls.base_arguments_processor(etl, input_args) step_args['resource'] = etl.emr_cluster + return step_args diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index ac54276..907135f 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -1,13 +1,13 @@ """ Base class for an etl step """ - from ..config import Config from ..pipeline import Activity from ..pipeline import CopyActivity from ..pipeline import S3Node from ..s3 import S3Path from ..s3 import S3File +from ..s3 import S3LogPath from ..utils.exceptions import ETLInputError config = Config() @@ -342,9 +342,86 @@ def activities(self): """ return [x for x in self._objects.values() if isinstance(x, Activity)] + @classmethod + def base_arguments_processor(cls, etl, input_args): + """Process the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + input_args(dict): Dictionary of the step arguments from the YAML + """ + # Base dictionary for every step + step_args = { + 'resource': None, + 'schedule': etl.schedule, + 'max_retries': etl.max_retries, + 'required_steps': list() + } + step_args.update(input_args) + + # Description is optional and should not be passed + step_args.pop('description', None) + + # Add dependencies + depends_on = step_args.pop('depends_on', None) + if depends_on: + for step_id in list(depends_on): + if step_id not in etl.steps: + raise ETLInputError('Step depends on non-existent step') + step_args['required_steps'].append(etl.steps[step_id]) + + # Set input node and required_steps + input_node = step_args.get('input_node', None) + if input_node: + if isinstance(input_node, dict): + input_node = etl.translate_input_nodes(input_node) + elif isinstance(input_node, str): + input_node = etl.intermediate_nodes[input_node] + step_args['input_node'] = input_node + + # Add dependencies from steps that create input nodes + if isinstance(input_node, dict): + required_nodes = input_node.values() 
+ else: + required_nodes = [input_node] + + for required_node in required_nodes: + for step in etl.steps.values(): + if step not in step_args['required_steps'] and \ + required_node in step.pipeline_objects: + step_args['required_steps'].append(step) - @staticmethod - def argument_parser(etl, step_args): + # Set the name if name not provided + if 'name' in step_args: + name = step_args.pop('name') + else: + # If the name of the step is not provided, one is assigned as: + # [step_class][index] + name = cls.__name__ + str(sum( + [1 for a in etl.steps.values() if isinstance(a, cls)] + )) + + # Each step is given it's own directory so that there is no clashing + # of file names. + step_args.update({ + 'id': name, + 's3_log_dir': S3LogPath(name, parent_dir=etl.s3_log_dir, + is_directory=True), + 's3_data_dir': S3Path(name, parent_dir=etl.s3_data_dir, + is_directory=True), + 's3_source_dir': S3Path(name, parent_dir=etl.s3_source_dir, + is_directory=True), + }) + + return step_args + + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class """ + step_args = cls.base_arguments_processor(etl, input_args) return step_args diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index 0cb82c6..635e1bc 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -21,16 +21,20 @@ def __init__(self, path, **kwargs): self._output = self.create_s3_data_node(s3_object=S3File(path=path)) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) + step_args.pop('resource') if etl.frequency != 'one-time': raise ETLInputError( 'Extract Local can be used for one-time pipelines only') + return step_args diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index 6190f65..9cb9539 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -117,13 +117,16 @@ def __init__(self, schedule=self.schedule, ) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - step_args.pop('input_node', None) + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index 6eb5205..e60a808 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -54,14 +54,17 @@ def __init__(self, command_options=["DELIMITER '\t' ESCAPE"], ) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments 
for the class """ + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) step_args['redshift_database'] = etl.redshift_database - step_args.pop('input_node', None) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index 43b1b9e..14c485f 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -19,13 +19,16 @@ def __init__(self, uri, **kwargs): super(ExtractS3Step, self).__init__(**kwargs) self._output = self.create_s3_data_node(s3_object=S3Path(uri=uri)) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) step_args.pop('resource') + return step_args diff --git a/dataduct/steps/load_redshift.py b/dataduct/steps/load_redshift.py index c63ef32..320e616 100644 --- a/dataduct/steps/load_redshift.py +++ b/dataduct/steps/load_redshift.py @@ -64,13 +64,16 @@ def __init__(self, command_options=command_options, ) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + step_args = cls.base_arguments_processor(etl, input_args) step_args['redshift_database'] = etl.redshift_database + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index f8c88fe..138eee7 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -41,14 +41,17 @@ def __init__(self, script_arguments=script_arguments, **kwargs) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) step_args['pipeline_name'] = etl.name - step_args.pop('input_node', None) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 6984582..2c6b8cb 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -56,14 +56,17 @@ def __init__(self, queue=queue, ) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) step_args['redshift_database'] = etl.redshift_database - step_args.pop('input_node', None) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 9b9d462..88d7b61 100644 --- 
a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -120,14 +120,18 @@ def input_format(key, value): """ return ''.join('--', key, '=', value) - @staticmethod - def argument_parser(etl, step_args): + @classmethod + def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline Args: etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ + step_args = cls.base_arguments_processor(etl, input_args) if step_args.pop('resource_type', None) == const.EMR_CLUSTER_STR: step_args['resource'] = etl.emr_cluster + else: + step_args['resource'] = etl.ec2_resource + return step_args From 49e9b103e0f25933e1f6c7adf7af191e10f40b90 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 31 Dec 2014 01:15:16 -0800 Subject: [PATCH 019/175] customs steps --- dataduct/etl/etl_pipeline.py | 36 +++++++++++++++++++ dataduct/s3/s3_directory.py | 2 +- dataduct/s3/s3_file.py | 2 +- dataduct/s3/utils.py | 27 -------------- dataduct/steps/__init__.py | 1 + dataduct/utils/exceptions.py | 37 ++----------------- dataduct/utils/helpers.py | 35 ++++++++++++++++++ examples/example_custom_extract_local.yaml | 10 ++++++ examples/scripts/custom_extract_local.py | 42 ++++++++++++++++++++++ 9 files changed, 128 insertions(+), 64 deletions(-) create mode 100644 examples/example_custom_extract_local.yaml create mode 100644 examples/scripts/custom_extract_local.py diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index d3062c9..610eaf5 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -3,6 +3,7 @@ """ from datetime import datetime import yaml +import imp from ..config import Config @@ -16,6 +17,7 @@ from ..pipeline import SNSAlarm from ..pipeline.utils import list_pipelines +from ..steps import ETLStep from ..steps import EMRStreamingStep from ..steps import ExtractLocalStep from ..steps import ExtractRdsStep @@ -31,6 +33,7 @@ from ..s3 import S3LogPath from ..utils.exceptions import ETLInputError +from ..utils.helpers import parse_path from ..utils import constants as const config = Config() @@ -43,6 +46,7 @@ LOG_STR = 'logs' DATA_STR = 'data' SRC_STR = 'src' +CUSTOM_STEPS_PATH = 'CUSTOM_STEPS_PATH' class ETLPipeline(object): @@ -97,6 +101,8 @@ def __init__(self, name, frequency='one-time', else: self.emr_cluster_config = dict() + self.custom_steps = self.get_custom_steps() + # Pipeline versions self.version_ts = datetime.utcnow() self.version_name = "version_" + \ @@ -380,6 +386,33 @@ def translate_input_nodes(self, input_node): output[value] = self.intermediate_nodes[key] return output + @staticmethod + def get_custom_steps(): + """Fetch the custom steps specified in config + """ + if not hasattr(config, 'custom_steps'): + return dict() + + custom_steps = dict() + + for step_def in config.custom_steps: + step_type = step_def['step_type'] + path = parse_path(step_def['file_path'], CUSTOM_STEPS_PATH) + + # Load source from the file path provided + step_mod = imp.load_source(step_type, path) + + # Get the step class based on class_name provided + step_class = getattr(step_mod, step_def['class_name']) + + # Check if step_class is of type ETLStep + if not issubclass(step_class, ETLStep): + raise ETLInputError('Step type %s is not of type ETLStep') + + custom_steps[step_type] = step_class + + return custom_steps + def parse_step_args(self, step_type, **kwargs): """Parse step arguments from input to correct ETL step types @@ -422,6 +455,9 @@ def 
parse_step_args(self, step_type, **kwargs): elif step_type == 'load-redshift': step_class = LoadRedshiftStep + elif step_type in self.custom_steps: + step_class = self.custom_steps[step_type] + else: raise ETLInputError('Step type %s not recogonized' % step_type) diff --git a/dataduct/s3/s3_directory.py b/dataduct/s3/s3_directory.py index 2cb4c6d..e0c845c 100644 --- a/dataduct/s3/s3_directory.py +++ b/dataduct/s3/s3_directory.py @@ -2,8 +2,8 @@ Base class for storing a S3 File """ from .s3_path import S3Path -from .utils import parse_path from .utils import upload_dir_to_s3 +from ..utils.helpers import parse_path class S3Directory(object): diff --git a/dataduct/s3/s3_file.py b/dataduct/s3/s3_file.py index aed0fbb..9076ac1 100644 --- a/dataduct/s3/s3_file.py +++ b/dataduct/s3/s3_file.py @@ -4,7 +4,7 @@ from .s3_path import S3Path from .utils import upload_to_s3 from .utils import read_from_s3 -from .utils import parse_path +from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError DEFAULT_FILE_NAME = 'file' diff --git a/dataduct/s3/utils.py b/dataduct/s3/utils.py index 2f07511..712cffc 100644 --- a/dataduct/s3/utils.py +++ b/dataduct/s3/utils.py @@ -5,10 +5,8 @@ import os from .s3_path import S3Path -from ..config import Config from ..utils.exceptions import ETLInputError -RESOURCE_BASE_PATH = 'RESOURCE_BASE_PATH' def get_s3_bucket(bucket_name): """Returns an S3 bucket object from boto @@ -160,28 +158,3 @@ def delete_dir_from_s3(s3_path): keys = bucket.get_all_keys(prefix=s3_path.key) for key in keys: key.delete() - - -def parse_path(path): - """Change the resource paths for files and directory based on params - - If the path is None, the function returns None. - Else if the path is an absolute path then return the path as is. - Else if the path is a relative path and resource_base_path is declared then - assume the path is relative to the resource_base_path - Else return the path as is. - - Args: - path(str): path specified in the YAML file - """ - # If path is None or absolute - if path is None or os.path.isabs(path): - return path - - # Try relative path to specified config - config = Config() - if RESOURCE_BASE_PATH in config.etl: - return os.path.join(config.etl[RESOURCE_BASE_PATH], path) - - # Return the path as is. - return path diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index 27fe8a5..d7ca553 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -1,3 +1,4 @@ +from etl_step import ETLStep from emr_streaming import EMRStreamingStep from extract_local import ExtractLocalStep from extract_rds import ExtractRdsStep diff --git a/dataduct/utils/exceptions.py b/dataduct/utils/exceptions.py index f3a1560..178517a 100644 --- a/dataduct/utils/exceptions.py +++ b/dataduct/utils/exceptions.py @@ -2,39 +2,6 @@ Exceptions for etl_lib """ -class ETLInputError(Exception): - """Error raised when function input is incorrect. +class ETLInputError(Exception): pass - Args: - msg (str): Human readable string describing the exception. - code (int, optional): Error code, defaults to 2. - - Attributes: - msg (str): Human readable string describing the exception. - code (int): Exception error code. - - """ - def __init__(self, msg, code=2): - """Constructor for the exception - """ - self.msg = msg - self.code = code - - -class ETLConfigError(Exception): - """Error raised when function input is incorrect. - - Args: - msg (str): Human readable string describing the exception. - code (int, optional): Error code, defaults to 2. 
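With the docstrings and the custom constructor gone, both exception types behave like plain Exceptions, so the message lives in the standard exception args rather than the old msg/code attributes. A minimal illustration, assuming only what this patch keeps:

    from dataduct.utils.exceptions import ETLInputError

    try:
        raise ETLInputError('Step type foo-bar not recognized')
    except ETLInputError as error:
        print str(error)   # prints: Step type foo-bar not recognized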
- - Attributes: - msg (str): Human readable string describing the exception. - code (int): Exception error code. - - """ - def __init__(self, msg, code=2): - """Constructor for the exception - """ - self.msg = msg - self.code = code +class ETLConfigError(Exception): pass diff --git a/dataduct/utils/helpers.py b/dataduct/utils/helpers.py index c1f7415..8d53ae6 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -3,8 +3,14 @@ """ import time import math +import os from sys import stderr +from ..config import Config + +RESOURCE_BASE_PATH = 'RESOURCE_BASE_PATH' +CUSTOM_STEPS_PATH = 'CUSTOM_STEPS_PATH' + def exactly_one(*args): """Asserts one of the arguments is not None @@ -84,3 +90,32 @@ def f_retry(*args, **kwargs): # @retry(arg[, ...]) -> true decorator return deco_retry + + +def parse_path(path, path_type=RESOURCE_BASE_PATH): + """Change the resource paths for files and directory based on params + + If the path is None, the function returns None. + Else if the path is an absolute path then return the path as is. + Else if the path is a relative path and resource_base_path is declared then + assume the path is relative to the resource_base_path + Else return the path as is. + + Args: + path(str): path specified in the YAML file + """ + # If path is None or absolute + if path is None or os.path.isabs(path): + return path + + # Try relative path to specified config + config = Config() + if path_type == RESOURCE_BASE_PATH: + if RESOURCE_BASE_PATH in config.etl: + return os.path.join(config.etl[RESOURCE_BASE_PATH], path) + else: + if CUSTOM_STEPS_PATH in config.etl: + return os.path.join(config.etl[CUSTOM_STEPS_PATH], path) + + # Return the path as is. + return path diff --git a/examples/example_custom_extract_local.yaml b/examples/example_custom_extract_local.yaml new file mode 100644 index 0000000..aeb8d54 --- /dev/null +++ b/examples/example_custom_extract_local.yaml @@ -0,0 +1,10 @@ +name : example_custom_extract_local +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +description : | + This example uploads a local file to S3 with the extract-local step. 
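The custom-extract-local step type used below is not one of the built-in types; per get_custom_steps above, it is assumed to be registered in the dataduct config with an entry along these lines (a sketch; the key names are the ones this patch reads, and the file path points at the example script added in this patch):

    import yaml

    # get_custom_steps() reads 'step_type', 'file_path' (passed through
    # parse_path, so either absolute or relative to the CUSTOM_STEPS_PATH
    # config entry) and 'class_name' for every item under custom_steps.
    assumed_custom_steps = yaml.safe_load("""
    custom_steps:
    - step_type: custom-extract-local
      file_path: examples/scripts/custom_extract_local.py
      class_name: CustomExtractLocalStep
    """)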
+ +steps: +- step_type: custom-extract-local + path: examples/resources/test_table1.tsv diff --git a/examples/scripts/custom_extract_local.py b/examples/scripts/custom_extract_local.py new file mode 100644 index 0000000..8ddbb64 --- /dev/null +++ b/examples/scripts/custom_extract_local.py @@ -0,0 +1,42 @@ +""" +ETL step wrapper for creating an S3 node for input from local files +""" +from dataduct.steps import ETLStep +from dataduct.s3 import S3File +from dataduct.utils.exceptions import ETLInputError + + +class CustomExtractLocalStep(ETLStep): + """ExtractLocal Step class that helps get data from a local file + """ + + def __init__(self, path, **kwargs): + """Constructor for the ExtractLocalStep class + + Args: + path(str): local path for data + **kwargs(optional): Keyword arguments directly passed to base class + """ + print 'Using the Custom Extract Local Step' + + super(CustomExtractLocalStep, self).__init__(**kwargs) + self._output = self.create_s3_data_node(s3_object=S3File(path=path)) + + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args.pop('input_node', None) + step_args = cls.base_arguments_processor(etl, input_args) + + step_args.pop('resource') + if etl.frequency != 'one-time': + raise ETLInputError( + 'Custom Extract Local can be used for one-time pipelines only') + + return step_args From 64c0231751c2cd5ee9d8d1815ec0a914f3e7b1d4 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 31 Dec 2014 01:24:33 -0800 Subject: [PATCH 020/175] better name for constants --- dataduct/config/example_config | 28 +++++++------- dataduct/etl/etl_pipeline.py | 4 +- dataduct/pipeline/copy_activity.py | 4 +- dataduct/pipeline/default_object.py | 8 ++-- dataduct/pipeline/ec2_resource.py | 13 +++---- dataduct/pipeline/emr_activity.py | 4 +- dataduct/pipeline/emr_resource.py | 42 ++++++++++----------- dataduct/pipeline/redshift_copy_activity.py | 4 +- dataduct/pipeline/shell_command_activity.py | 4 +- dataduct/pipeline/sns_alarm.py | 4 +- dataduct/pipeline/sql_activity.py | 4 +- dataduct/steps/etl_step.py | 4 +- docs/installation.rst | 28 +++++++------- 13 files changed, 74 insertions(+), 77 deletions(-) diff --git a/dataduct/config/example_config b/dataduct/config/example_config index 171fd31..9aecaca 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -1,24 +1,24 @@ # Constants that are used across the dataduct library ec2: - DEFAULT_ROLE: FILL_ME_IN - DEFAULT_RESOURCE_ROLE: FILL_ME_IN - DEFAULT_EC2_INSTANCE_TYPE: m1.large + ROLE: FILL_ME_IN + RESOURCE_ROLE: FILL_ME_IN + INSTANCE_TYPE: m1.large ETL_AMI: ami-05355a6c # Default AMI used by data pipeline KEY_PAIR: FILL_ME_IN SECURITY_GROUP: FILL_ME_IN emr: - DEFAULT_NUM_CORE_INSTANCES: 3 - DEFAULT_CORE_INSTANCE_TYPE: m1.large - DEFAULT_TASK_INSTANCE_BID_PRICE: null # null if we want it to be None - DEFAULT_TASK_INSTANCE_TYPE: m1.large - DEFAULT_MASTER_INSTANCE_TYPE: m1.large - DEFAULT_CLUSTER_TIMEOUT: 6 Hours - DEFAULT_HADOOP_VERSION: null - DEFAULT_HIVE_VERSION: null - DEFAULT_PIG_VERSION: null - DEFAULT_CLUSTER_AMI: 2.4.7 + NUM_CORE_INSTANCES: 3 + CORE_INSTANCE_TYPE: m1.large + TASK_INSTANCE_BID_PRICE: null # null if we want it to be None + TASK_INSTANCE_TYPE: m1.large + MASTER_INSTANCE_TYPE: m1.large + CLUSTER_TIMEOUT: 6 Hours + HADOOP_VERSION: null + HIVE_VERSION: null + 
PIG_VERSION: null + CLUSTER_AMI: 2.4.7 redshift: DATABASE_NAME: FILL_ME_IN @@ -34,7 +34,7 @@ mysql: etl: RETRY_DELAY: 10 Minutes - DEFAULT_MAX_RETRIES: 0 + MAX_RETRIES: 0 S3_ETL_BUCKET: FILL_ME_IN SNS_TOPIC_ARN_FAILURE: FILL_ME_IN SNS_TOPIC_ARN_WARNING: FILL_ME_IN diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 610eaf5..3242059 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -38,7 +38,7 @@ config = Config() S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) S3_BASE_PATH = config.etl.get('S3_BASE_PATH', '') SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) @@ -59,7 +59,7 @@ class ETLPipeline(object): def __init__(self, name, frequency='one-time', ec2_resource_terminate_after='6 Hours', delay=0, emr_cluster_config=None, load_time=None, - topic_arn=None, max_retries=DEFAULT_MAX_RETRIES, + topic_arn=None, max_retries=MAX_RETRIES, bootstrap=None): """Constructor for the pipeline class diff --git a/dataduct/pipeline/copy_activity.py b/dataduct/pipeline/copy_activity.py index 960eda0..1809d5a 100644 --- a/dataduct/pipeline/copy_activity.py +++ b/dataduct/pipeline/copy_activity.py @@ -9,7 +9,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') @@ -48,7 +48,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(CopyActivity, self).__init__( id=id, diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index 4160f13..9c3912f 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -6,8 +6,8 @@ from ..config import Config config = Config() -DEFAULT_ROLE = config.etl['DEFAULT_ROLE'] -DEFAULT_RESOURCE_ROLE = config.etl['DEFAULT_RESOURCE_ROLE'] +ROLE = config.etl['ROLE'] +RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] class DefaultObject(PipelineObject): @@ -37,7 +37,7 @@ def __init__(self, id='Default', # This should always have the default id scheduleType=scheduleType, failureAndRerunMode=failureAndRerunMode, - role=DEFAULT_ROLE, - resourceRole=DEFAULT_RESOURCE_ROLE, + role=ROLE, + resourceRole=RESOURCE_ROLE, onFail=sns ) diff --git a/dataduct/pipeline/ec2_resource.py b/dataduct/pipeline/ec2_resource.py index a7a7802..39b4880 100644 --- a/dataduct/pipeline/ec2_resource.py +++ b/dataduct/pipeline/ec2_resource.py @@ -9,11 +9,10 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_ROLE = config.etl['DEFAULT_ROLE'] -DEFAULT_RESOURCE_ROLE = config.etl['DEFAULT_RESOURCE_ROLE'] +ROLE = config.etl['ROLE'] +RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] -DEFAULT_EC2_INSTANCE_TYPE = config.ec2.get( - 'DEFAULT_EC2_INSTANCE_TYPE', 'm1.large') +INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', 'm1.large') ETL_AMI = config.ec2.get('ETL_AMI', None) SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', None) KEY_PAIR = config.etl.get('KEY_PAIR', None) @@ -29,7 +28,7 @@ def __init__(self, s3_log_dir=None, schedule=None, terminate_after='6 Hours', - instance_type=DEFAULT_EC2_INSTANCE_TYPE, + instance_type=INSTANCE_TYPE, ami=ETL_AMI, security_group=SECURITY_GROUP, **kwargs): @@ -62,8 +61,8 @@ def __init__(self, schedule=schedule, imageId=ami, instanceType=instance_type, - 
role=DEFAULT_ROLE, - resourceRole=DEFAULT_RESOURCE_ROLE, + role=ROLE, + resourceRole=RESOURCE_ROLE, keyPair=KEY_PAIR, retryDelay=RETRY_DELAY, securityGroups=security_group diff --git a/dataduct/pipeline/emr_activity.py b/dataduct/pipeline/emr_activity.py index c1a2e2c..5d290de 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -8,7 +8,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) class EmrActivity(Activity): @@ -46,7 +46,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(EmrActivity, self).__init__( id=id, diff --git a/dataduct/pipeline/emr_resource.py b/dataduct/pipeline/emr_resource.py index 5abb5b6..a5f6ff4 100644 --- a/dataduct/pipeline/emr_resource.py +++ b/dataduct/pipeline/emr_resource.py @@ -9,18 +9,16 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_NUM_CORE_INSTANCES = config.emr.get('DEFAULT_NUM_CORE_INSTANCES', None) -DEFAULT_CORE_INSTANCE_TYPE = config.emr.get('DEFAULT_CORE_INSTANCE_TYPE', 'm1.large') -DEFAULT_TASK_INSTANCE_BID_PRICE = config.emr.get( - 'DEFAULT_TASK_INSTANCE_BID_PRICE', None) -DEFAULT_TASK_INSTANCE_TYPE = config.emr.get('DEFAULT_TASK_INSTANCE_TYPE', 'm1.large') -DEFAULT_MASTER_INSTANCE_TYPE = config.emr.get( - 'DEFAULT_MASTER_INSTANCE_TYPE', 'm1.large') -DEFAULT_CLUSTER_TIMEOUT = config.emr.get('DEFAULT_CLUSTER_TIMEOUT', '6 Hours') -DEFAULT_HADOOP_VERSION = config.emr.get('DEFAULT_HADOOP_VERSION', None) -DEFAULT_HIVE_VERSION = config.emr.get('DEFAULT_HIVE_VERSION', None) -DEFAULT_PIG_VERSION = config.emr.get('DEFAULT_PIG_VERSION', None) -DEFAULT_CLUSTER_AMI = config.emr.get('DEFAULT_CLUSTER_AMI', '2.4.7') +NUM_CORE_INSTANCES = config.emr.get('NUM_CORE_INSTANCES', None) +CORE_INSTANCE_TYPE = config.emr.get('CORE_INSTANCE_TYPE', 'm1.large') +TASK_INSTANCE_BID_PRICE = config.emr.get('TASK_INSTANCE_BID_PRICE', None) +TASK_INSTANCE_TYPE = config.emr.get('TASK_INSTANCE_TYPE', 'm1.large') +MASTER_INSTANCE_TYPE = config.emr.get('MASTER_INSTANCE_TYPE', 'm1.large') +CLUSTER_TIMEOUT = config.emr.get('CLUSTER_TIMEOUT', '6 Hours') +HADOOP_VERSION = config.emr.get('HADOOP_VERSION', None) +HIVE_VERSION = config.emr.get('HIVE_VERSION', None) +PIG_VERSION = config.emr.get('PIG_VERSION', None) +CLUSTER_AMI = config.emr.get('CLUSTER_AMI', '2.4.7') KEY_PAIR = config.etl.get('KEY_PAIR', None) @@ -32,18 +30,18 @@ def __init__(self, id, s3_log_dir, schedule, - num_instances=DEFAULT_NUM_CORE_INSTANCES, - instance_size=DEFAULT_CORE_INSTANCE_TYPE, + num_instances=NUM_CORE_INSTANCES, + instance_size=CORE_INSTANCE_TYPE, bootstrap=None, num_task_instances=None, - task_bid_price=DEFAULT_TASK_INSTANCE_BID_PRICE, - task_instance_type=DEFAULT_TASK_INSTANCE_TYPE, - master_instance_size=DEFAULT_MASTER_INSTANCE_TYPE, - terminate_after=DEFAULT_CLUSTER_TIMEOUT, - hadoop_version=DEFAULT_HADOOP_VERSION, - install_hive=DEFAULT_HIVE_VERSION, - install_pig=DEFAULT_PIG_VERSION, - ami_version=DEFAULT_CLUSTER_AMI): + task_bid_price=TASK_INSTANCE_BID_PRICE, + task_instance_type=TASK_INSTANCE_TYPE, + master_instance_size=MASTER_INSTANCE_TYPE, + terminate_after=CLUSTER_TIMEOUT, + hadoop_version=HADOOP_VERSION, + install_hive=HIVE_VERSION, + install_pig=PIG_VERSION, + ami_version=CLUSTER_AMI): """Constructor for the Ec2Resource class Args: diff --git a/dataduct/pipeline/redshift_copy_activity.py 
b/dataduct/pipeline/redshift_copy_activity.py index 18cc1ff..449cde6 100644 --- a/dataduct/pipeline/redshift_copy_activity.py +++ b/dataduct/pipeline/redshift_copy_activity.py @@ -9,7 +9,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') @@ -49,7 +49,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES kwargs = { 'id': id, diff --git a/dataduct/pipeline/shell_command_activity.py b/dataduct/pipeline/shell_command_activity.py index 953828d..0c47fb8 100644 --- a/dataduct/pipeline/shell_command_activity.py +++ b/dataduct/pipeline/shell_command_activity.py @@ -9,7 +9,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') @@ -57,7 +57,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(ShellCommandActivity, self).__init__( id=id, diff --git a/dataduct/pipeline/sns_alarm.py b/dataduct/pipeline/sns_alarm.py index c822bab..395fecd 100644 --- a/dataduct/pipeline/sns_alarm.py +++ b/dataduct/pipeline/sns_alarm.py @@ -7,7 +7,7 @@ config = Config() SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) -DEFAULT_ROLE = config.etl['DEFAULT_ROLE'] +ROLE = config.etl['ROLE'] class SNSAlarm(PipelineObject): @@ -48,7 +48,7 @@ def __init__(self, id=id, type='SnsAlarm', topicArn=topic_arn, - role=DEFAULT_ROLE, + role=ROLE, subject='Data Pipeline Failure', message=failure_message, ) diff --git a/dataduct/pipeline/sql_activity.py b/dataduct/pipeline/sql_activity.py index 95407b9..cecadc1 100644 --- a/dataduct/pipeline/sql_activity.py +++ b/dataduct/pipeline/sql_activity.py @@ -10,7 +10,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') @@ -54,7 +54,7 @@ def __init__(self, if depends_on is None: depends_on = [] if max_retries is None: - max_retries = DEFAULT_MAX_RETRIES + max_retries = MAX_RETRIES super(SqlActivity, self).__init__( id=id, diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 907135f..80ad625 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -11,7 +11,7 @@ from ..utils.exceptions import ETLInputError config = Config() -DEFAULT_MAX_RETRIES = config.etl.get('DEFAULT_MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) class ETLStep(object): @@ -32,7 +32,7 @@ class ETLStep(object): def __init__(self, id, s3_data_dir=None, s3_log_dir=None, s3_source_dir=None, schedule=None, resource=None, input_node=None, required_steps=None, - max_retries=DEFAULT_MAX_RETRIES): + max_retries=MAX_RETRIES): """Constructor for the ETLStep object Args: diff --git a/docs/installation.rst b/docs/installation.rst index dae6e2c..6d90ea5 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -32,24 +32,24 @@ environment variable pointing to the config file location by setting the # Constants that are used across the dataduct library ec2: - DEFAULT_ROLE: FILL_ME_IN - DEFAULT_RESOURCE_ROLE: FILL_ME_IN 
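The renamed keys are read through the same Config sections as before; for reference, a few of the lookups as they appear elsewhere in this patch (the defaults shown are the ones used in the code):

    from dataduct.config import Config

    config = Config()
    instance_type = config.ec2.get('INSTANCE_TYPE', 'm1.large')  # was DEFAULT_EC2_INSTANCE_TYPE
    num_instances = config.emr.get('NUM_CORE_INSTANCES', None)   # was DEFAULT_NUM_CORE_INSTANCES
    max_retries = config.etl.get('MAX_RETRIES', 0)               # was DEFAULT_MAX_RETRIES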
- DEFAULT_EC2_INSTANCE_TYPE: m1.large + ROLE: FILL_ME_IN + RESOURCE_ROLE: FILL_ME_IN + INSTANCE_TYPE: m1.large ETL_AMI: ami-05355a6c # Default AMI used by data pipeline KEY_PAIR: FILL_ME_IN SECURITY_GROUP: FILL_ME_IN emr: - DEFAULT_NUM_CORE_INSTANCES: 3 - DEFAULT_CORE_INSTANCE_TYPE: m1.large - DEFAULT_TASK_INSTANCE_BID_PRICE: null # null if we want it to be None - DEFAULT_TASK_INSTANCE_TYPE: m1.large - DEFAULT_MASTER_INSTANCE_TYPE: m1.large - DEFAULT_CLUSTER_TIMEOUT: 6 Hours - DEFAULT_HADOOP_VERSION: null - DEFAULT_HIVE_VERSION: null - DEFAULT_PIG_VERSION: null - DEFAULT_CLUSTER_AMI: 2.4.7 + NUM_CORE_INSTANCES: 3 + CORE_INSTANCE_TYPE: m1.large + TASK_INSTANCE_BID_PRICE: null # null if we want it to be None + TASK_INSTANCE_TYPE: m1.large + MASTER_INSTANCE_TYPE: m1.large + CLUSTER_TIMEOUT: 6 Hours + HADOOP_VERSION: null + HIVE_VERSION: null + PIG_VERSION: null + CLUSTER_AMI: 2.4.7 redshift: DATABASE_NAME: FILL_ME_IN @@ -65,7 +65,7 @@ environment variable pointing to the config file location by setting the etl: RETRY_DELAY: 10 Minutes - DEFAULT_MAX_RETRIES: 0 + MAX_RETRIES: 0 S3_ETL_BUCKET: FILL_ME_IN SNS_TOPIC_ARN_FAILURE: FILL_ME_IN SNS_TOPIC_ARN_WARNING: FILL_ME_IN From 77910ec735359fd8ba637c4ab563aba53bf8a18c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 31 Dec 2014 14:47:49 -0800 Subject: [PATCH 021/175] simplify steps --- dataduct/etl/etl_pipeline.py | 21 ++------------- examples/scripts/custom_extract_local.py | 34 ++++-------------------- 2 files changed, 7 insertions(+), 48 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 3242059..c860425 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -91,7 +91,7 @@ def __init__(self, name, frequency='one-time', if bootstrap is not None: self.bootstrap_definitions = bootstrap - elif hasattr(config, 'bootstrap'): + elif getattr(config, 'bootstrap', None): self.bootstrap_definitions = config.bootstrap else: self.bootstrap_definitions = dict() @@ -390,12 +390,9 @@ def translate_input_nodes(self, input_node): def get_custom_steps(): """Fetch the custom steps specified in config """ - if not hasattr(config, 'custom_steps'): - return dict() - custom_steps = dict() - for step_def in config.custom_steps: + for step_def in getattr(config, 'custom_steps', list()): step_type = step_def['step_type'] path = parse_path(step_def['file_path'], CUSTOM_STEPS_PATH) @@ -525,16 +522,6 @@ def create_steps(self, steps_params, is_bootstrap=False): steps.append(step) return steps - def allocate_resource(self, resource_type): - """Allocate the resource object based on the resource type specified - """ - if resource_type == const.EMR_CLUSTER_STR: - return self.emr_cluster - elif resource_type == EC2_RESOURCE_STR: - return self.ec2_resource - else: - raise ETLInputError('Unknown resource type found') - def create_bootstrap_steps(self, resource_type): """Create the boostrap steps for installation on all machines @@ -543,10 +530,6 @@ def create_bootstrap_steps(self, resource_type): can be ec2 / emr """ step_params = self.bootstrap_definitions.get(resource_type, list()) - for step_param in step_params: - # Mutating the steps here by adding resource - step_param['resource'] = self.allocate_resource(resource_type) - steps = self.create_steps(step_params, True) self._bootstrap_steps.extend(steps) return steps diff --git a/examples/scripts/custom_extract_local.py b/examples/scripts/custom_extract_local.py index 8ddbb64..4acb4c7 100644 --- a/examples/scripts/custom_extract_local.py +++ 
b/examples/scripts/custom_extract_local.py @@ -1,42 +1,18 @@ """ ETL step wrapper for creating an S3 node for input from local files """ -from dataduct.steps import ETLStep -from dataduct.s3 import S3File -from dataduct.utils.exceptions import ETLInputError +from dataduct.steps import ExtractLocalStep -class CustomExtractLocalStep(ETLStep): - """ExtractLocal Step class that helps get data from a local file +class CustomExtractLocalStep(ExtractLocalStep): + """CustomExtractLocal Step class that helps get data from a local file """ - def __init__(self, path, **kwargs): - """Constructor for the ExtractLocalStep class + def __init__(self, **kwargs): + """Constructor for the CustomExtractLocal class Args: - path(str): local path for data **kwargs(optional): Keyword arguments directly passed to base class """ print 'Using the Custom Extract Local Step' - super(CustomExtractLocalStep, self).__init__(**kwargs) - self._output = self.create_s3_data_node(s3_object=S3File(path=path)) - - - @classmethod - def arguments_processor(cls, etl, input_args): - """Parse the step arguments according to the ETL pipeline - - Args: - etl(ETLPipeline): Pipeline object containing resources and steps - step_args(dict): Dictionary of the step arguments for the class - """ - input_args.pop('input_node', None) - step_args = cls.base_arguments_processor(etl, input_args) - - step_args.pop('resource') - if etl.frequency != 'one-time': - raise ETLInputError( - 'Custom Extract Local can be used for one-time pipelines only') - - return step_args From 281fc6d8785510a3b2b8c2dac7276ef14a332879 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 31 Dec 2014 17:18:38 -0800 Subject: [PATCH 022/175] streaming inputs cleanup --- dataduct/pipeline/emr_activity.py | 2 ++ dataduct/steps/emr_streaming.py | 38 +++++++++-------------------- examples/example_emr_streaming.yaml | 2 +- 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/dataduct/pipeline/emr_activity.py b/dataduct/pipeline/emr_activity.py index 5d290de..cea06a2 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -19,6 +19,7 @@ def __init__(self, id, resource, schedule, + input_node, emr_step_string, output_node=None, additional_files=None, @@ -57,6 +58,7 @@ def __init__(self, schedule=schedule, step=emr_step_string, output=output_node, + input=input_node, ) self.add_additional_files(additional_files) diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 73f5f1f..d6f5b5d 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -48,7 +48,7 @@ def create_command_hadoop_2(mapper, reducer, command, command_options): return ','.join(command) -def create_command(mapper, reducer, ami_version, input_uri, output, +def create_command(mapper, reducer, ami_version, input, output, hadoop_params): """Create the command step string given the input to streaming step """ @@ -66,13 +66,7 @@ def create_command(mapper, reducer, ami_version, input_uri, output, command_options.extend(['-output', output.path().uri]) # Add input uri - if isinstance(input_uri, list): - for i in input_uri: - assert isinstance(i, S3Path) - command_options.extend(['-input', i.uri]) - else: - assert isinstance(input_uri, S3Path), type(input_uri) - command_options.extend(['-input', input_uri.uri]) + command_options.extend(['-input', input.path().uri]) if ami_family in HADOOP_1_SERIES: return create_command_hadoop_1(mapper, reducer, command, @@ -89,7 +83,7 @@ class EMRStreamingStep(ETLStep): def __init__(self, 
mapper, reducer=None, - input=None, + input_path=None, hadoop_params=None, depends_on=None, **kwargs): @@ -105,19 +99,23 @@ def __init__(self, # As EMR streaming allows inputs as both input_node and input # We remove the default input_node if input is given - if input is not None: + if input_path is not None: input_node = kwargs.pop('input_node', None) else: input_node = kwargs.get('input_node', None) - if input is not None and 'input_node' in kwargs: - raise ETLInputError('Both input and input_node specified') + if input_path is not None and 'input_node' in kwargs: + raise ETLInputError('Both input_path and input_node specified') super(EMRStreamingStep, self).__init__(**kwargs) + if input_path is not None: + input_node = self.create_s3_data_node(S3Path(uri=input_path)) + if depends_on is not None: self._depends_on = depends_on + self._input = input_node self._output = self.create_s3_data_node() # Create S3File with script / command provided @@ -128,25 +126,13 @@ def __init__(self, reducer = self.create_script(S3File(path=reducer)) additional_files.append(reducer) - if input is not None: - if isinstance(input, list): - input = [S3Path(uri=i) for i in input] - else: - input = S3Path(uri=input) - else: - if isinstance(input_node, list): - input = [i.path() for i in input_node] - elif isinstance(input_node, dict): - input = [i.path() for i in input_node.values()] - else: - input = input_node.path() - step_string = create_command(mapper, reducer, self.resource.ami_version, - input, self._output, hadoop_params) + self._input, self._output, hadoop_params) self.activity = self.create_pipeline_object( object_class=EmrActivity, resource=self.resource, + input_node=input_node, schedule=self.schedule, emr_step_string=step_string, output_node=self._output, diff --git a/examples/example_emr_streaming.yaml b/examples/example_emr_streaming.yaml index 0bf29e9..6d14fdd 100644 --- a/examples/example_emr_streaming.yaml +++ b/examples/example_emr_streaming.yaml @@ -3,7 +3,7 @@ frequency : one-time load_time: 01:00 # Hour:Min in UTC emr_cluster_config: num_instances: 1 - instance_size: m1.xlarge + instance_size: m1.large ami_version: 3.3.1 description : Example for the emr_streaming step From 11142ebccddd0cfd4e923d1c2b8b349c72bfa269 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 1 Jan 2015 02:14:45 -0800 Subject: [PATCH 023/175] input_path for nodes --- dataduct/etl/etl_pipeline.py | 3 ++- dataduct/steps/emr_streaming.py | 20 ++------------------ dataduct/steps/etl_step.py | 22 ++++++++++++++++++---- dataduct/steps/extract_local.py | 2 +- dataduct/steps/extract_rds.py | 2 +- dataduct/steps/extract_redshift.py | 2 +- dataduct/steps/extract_s3.py | 2 +- dataduct/steps/qa_transform.py | 2 +- dataduct/steps/sql_command.py | 2 +- 9 files changed, 28 insertions(+), 29 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index c860425..7ea06aa 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -502,7 +502,8 @@ def create_steps(self, steps_params, is_bootstrap=False): # Assume that the preceding step is the input if not specified if isinstance(input_node, S3Node) and \ - 'input_node' not in step_param: + 'input_node' not in step_param and \ + 'input_path' not in step_param: step_param['input_node'] = input_node step_class, step_args = self.parse_step_args(**step_param) diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index d6f5b5d..9191236 100644 --- a/dataduct/steps/emr_streaming.py +++ 
b/dataduct/steps/emr_streaming.py @@ -83,7 +83,6 @@ class EMRStreamingStep(ETLStep): def __init__(self, mapper, reducer=None, - input_path=None, hadoop_params=None, depends_on=None, **kwargs): @@ -96,26 +95,11 @@ def __init__(self, hadoop_params(list of str): arguments to the hadoop command **kwargs(optional): Keyword arguments directly passed to base class """ - - # As EMR streaming allows inputs as both input_node and input - # We remove the default input_node if input is given - if input_path is not None: - input_node = kwargs.pop('input_node', None) - else: - input_node = kwargs.get('input_node', None) - - if input_path is not None and 'input_node' in kwargs: - raise ETLInputError('Both input_path and input_node specified') - super(EMRStreamingStep, self).__init__(**kwargs) - if input_path is not None: - input_node = self.create_s3_data_node(S3Path(uri=input_path)) - if depends_on is not None: self._depends_on = depends_on - self._input = input_node self._output = self.create_s3_data_node() # Create S3File with script / command provided @@ -127,12 +111,12 @@ def __init__(self, additional_files.append(reducer) step_string = create_command(mapper, reducer, self.resource.ami_version, - self._input, self._output, hadoop_params) + self.input, self.output, hadoop_params) self.activity = self.create_pipeline_object( object_class=EmrActivity, resource=self.resource, - input_node=input_node, + input_node=self.input, schedule=self.schedule, emr_step_string=step_string, output_node=self._output, diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 80ad625..dd74a47 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -31,7 +31,7 @@ class ETLStep(object): def __init__(self, id, s3_data_dir=None, s3_log_dir=None, s3_source_dir=None, schedule=None, resource=None, - input_node=None, required_steps=None, + input_node=None, input_path=None, required_steps=None, max_retries=MAX_RETRIES): """Constructor for the ETLStep object @@ -53,14 +53,19 @@ def __init__(self, id, s3_data_dir=None, s3_log_dir=None, self.resource = resource self.max_retries = max_retries self._depends_on = list() - self._input = input_node + self._input = None self._output = None self._objects = dict() self._required_steps = list() - self._activities = list() self._input_node = input_node + if input_path is not None and input_node is not None: + raise ETLInputError('Both input_path and input_node specified') + + if input_path is not None: + self._input_node = self.create_s3_data_node(S3Path(uri=input_path)) + if isinstance(input_node, list): if len(input_node) == 0: input_node = None @@ -268,7 +273,7 @@ def input(self): Note: Input is represented as None, a single node or dict of nodes """ - return self._input + return self._input_node @property def output(self): @@ -425,3 +430,12 @@ def arguments_processor(cls, etl, input_args): """ step_args = cls.base_arguments_processor(etl, input_args) return step_args + + @staticmethod + def pop_inputs(input_args): + """Remove the input nodes from the arguments dictionary + """ + input_args.pop('input_node', None) + input_args.pop('input_path', None) + + return input_args diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index 635e1bc..305dd1f 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -29,7 +29,7 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - 
input_args.pop('input_node', None) + input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) step_args.pop('resource') diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index 9cb9539..afa1df8 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -125,7 +125,7 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - input_args.pop('input_node', None) + input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) step_args['resource'] = etl.ec2_resource diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index e60a808..c4b6e23 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -62,7 +62,7 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - input_args.pop('input_node', None) + input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) step_args['redshift_database'] = etl.redshift_database step_args['resource'] = etl.ec2_resource diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index 14c485f..1b71c91 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -27,7 +27,7 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - input_args.pop('input_node', None) + input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) step_args.pop('resource') diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index 138eee7..8987ca8 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -49,7 +49,7 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - input_args.pop('input_node', None) + input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) step_args['pipeline_name'] = etl.name step_args['resource'] = etl.ec2_resource diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 2c6b8cb..267a645 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -64,7 +64,7 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - input_args.pop('input_node', None) + input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) step_args['redshift_database'] = etl.redshift_database step_args['resource'] = etl.ec2_resource From 5796eec81343dd30fe28381c6a78ccaed98abb43 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 1 Jan 2015 10:46:48 -0800 Subject: [PATCH 024/175] custom emr activity step --- dataduct/etl/etl_pipeline.py | 4 +++ dataduct/pipeline/emr_activity.py | 3 +- dataduct/steps/__init__.py | 1 + dataduct/steps/emr_job.py | 53 +++++++++++++++++++++++++++++++ dataduct/steps/emr_streaming.py | 5 +-- dataduct/steps/etl_step.py | 3 ++ 6 files changed, 64 insertions(+), 5 
deletions(-) create mode 100644 dataduct/steps/emr_job.py diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 7ea06aa..dbf882d 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -18,6 +18,7 @@ from ..pipeline.utils import list_pipelines from ..steps import ETLStep +from ..steps import EMRJobStep from ..steps import EMRStreamingStep from ..steps import ExtractLocalStep from ..steps import ExtractRdsStep @@ -449,6 +450,9 @@ def parse_step_args(self, step_type, **kwargs): elif step_type == 'emr-streaming': step_class = EMRStreamingStep + elif step_type == 'emr-step': + step_class = EMRJobStep + elif step_type == 'load-redshift': step_class = LoadRedshiftStep diff --git a/dataduct/pipeline/emr_activity.py b/dataduct/pipeline/emr_activity.py index cea06a2..53824fc 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -61,4 +61,5 @@ def __init__(self, input=input_node, ) - self.add_additional_files(additional_files) + if additional_files: + self.add_additional_files(additional_files) diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index d7ca553..c2d3fa4 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -1,5 +1,6 @@ from etl_step import ETLStep from emr_streaming import EMRStreamingStep +from emr_job import EMRJobStep from extract_local import ExtractLocalStep from extract_rds import ExtractRdsStep from extract_redshift import ExtractRedshiftStep diff --git a/dataduct/steps/emr_job.py b/dataduct/steps/emr_job.py new file mode 100644 index 0000000..963d453 --- /dev/null +++ b/dataduct/steps/emr_job.py @@ -0,0 +1,53 @@ +""" +ETL step wrapper for EmrActivity can be executed on EMR Cluster +""" +from .etl_step import ETLStep +from ..pipeline import EmrActivity + + +class EMRJobStep(ETLStep): + """EMR Step class that helps run a step on the emr cluster + """ + + def __init__(self, + step_string, + depends_on=None, + **kwargs): + """Constructor for the EMRJobStep class + + Args: + step_string(str): Step string for the emr job to be executed + **kwargs(optional): Keyword arguments directly passed to base class + + Note: + In the step_string all comma within arguments should be escaped + using 4 backslashes + """ + super(EMRJobStep, self).__init__(**kwargs) + + if depends_on is not None: + self._depends_on = depends_on + + self.activity = self.create_pipeline_object( + object_class=EmrActivity, + resource=self.resource, + input_node=self.input, + schedule=self.schedule, + emr_step_string=step_string, + output_node=self._output, + depends_on=self.depends_on, + max_retries=self.max_retries + ) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.emr_cluster + + return step_args diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 9191236..1678bb9 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -1,12 +1,9 @@ """ -ETL step wrapper for EmrActivity can be executed on Ec2 +ETL step wrapper for EmrStreamingActivity can be executed on EMR Cluster """ from .etl_step import ETLStep from ..pipeline import EmrActivity from ..s3 import S3File -from ..s3 import S3Path -from ..utils.exceptions import 
ETLInputError - HADOOP_1_SERIES = ['1', '2'] diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index dd74a47..fe35840 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -369,6 +369,9 @@ def base_arguments_processor(cls, etl, input_args): # Add dependencies depends_on = step_args.pop('depends_on', None) + if isinstance(depends_on, str): + depends_on = [depends_on] + if depends_on: for step_id in list(depends_on): if step_id not in etl.steps: From 399d12e07b55a5f5130aacccd0eab9de3429b164 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 2 Jan 2015 00:09:46 -0800 Subject: [PATCH 025/175] output path and node options --- dataduct/steps/etl_step.py | 6 +++--- dataduct/steps/transform.py | 17 +++++++++-------- examples/example_double_output.yaml | 10 +++++----- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index fe35840..f7d8e6c 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -255,10 +255,10 @@ def merge_s3_nodes(self, input_nodes): """ depends_on = list() combined_node = self.create_s3_data_node() - for input_node in input_nodes: - dest_uri = S3Path(key=input_node, is_directory=True, + for string_key, input_node in input_nodes.iteritems(): + dest_uri = S3Path(key=string_key, is_directory=True, parent_dir=combined_node.path()) - copy_activity = self.copy_s3(input_node=input_nodes[input_node], + copy_activity = self.copy_s3(input_node=input_node, dest_uri=dest_uri) depends_on.append(copy_activity) return combined_node, depends_on diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 88d7b61..8e35b1d 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -18,7 +18,7 @@ class TransformStep(ETLStep): def __init__(self, command=None, script=None, - output=None, + output_node=None, script_arguments=None, additional_s3_files=None, depends_on=None, @@ -28,7 +28,7 @@ def __init__(self, Args: command(str): command to be executed directly script(path): local path to the script that should executed - output(dict): output data nodes from the transform + output_node(dict): output data nodes from the transform script_arguments(list of str): list of arguments to the script additional_s3_files(list of S3File): additional files used **kwargs(optional): Keyword arguments directly passed to base class @@ -43,9 +43,9 @@ def __init__(self, # Create output_node if not provided if self._output is None: - output_node = self.create_s3_data_node() + base_output_node = self.create_s3_data_node() else: - output_node = self._output + base_output_node = self._output # Create S3File if script path provided if script: @@ -56,7 +56,7 @@ def __init__(self, self.create_pipeline_object( object_class=ShellCommandActivity, input_node=self._input_node, - output_node=output_node, + output_node=base_output_node, resource=self.resource, schedule=self.schedule, script_uri=script, @@ -69,10 +69,11 @@ def __init__(self, # Translate output nodes if output map provided if self._output is None: - if output: - self._output = self.create_output_nodes(output_node, output) + if output_node: + self._output = self.create_output_nodes( + base_output_node, output_node) else: - self._output = output_node + self._output = base_output_node def translate_arguments(self, script_arguments): """Translate script argument to lists diff --git a/examples/example_double_output.yaml b/examples/example_double_output.yaml index 968486a..e55d943 100644 --- 
a/examples/example_double_output.yaml +++ b/examples/example_double_output.yaml @@ -18,23 +18,23 @@ steps: input_node: step1_a: step2_a step1_b: step2_b - output: + output_node: - step2_a - step2_b - step_type: transform + name: profiler_1 script: examples/scripts/s3_profiler.py - input_node: - step2_a: output1 + input_node: step2_a script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR - -f - step_type: transform + name: profiler_2 script: examples/scripts/s3_profiler.py - input_node: - step2_b : output1 + input_node: step2_b script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR From a1a7172e5b47f2ebbcc0a838a2405412dfe852f2 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 2 Jan 2015 01:29:11 -0800 Subject: [PATCH 026/175] node dependencies --- dataduct/etl/etl_actions.py | 4 ++++ dataduct/pipeline/s3_node.py | 14 ++++++++++++++ dataduct/steps/etl_step.py | 31 ++++++++++++++----------------- dataduct/steps/extract_local.py | 3 +-- dataduct/steps/extract_s3.py | 2 +- 5 files changed, 34 insertions(+), 20 deletions(-) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 5b5568d..211fd09 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -143,6 +143,10 @@ def visualize_pipeline(etl, filename=None): for dependency in dependencies: graph.add_edge(dependency.id, p_object.id, color='blue') + if isinstance(p_object, S3Node): + for dependency in p_object.dependency_nodes: + graph.add_edge(dependency.id, p_object.id, color='grey') + # Plotting the graph with dot layout graph.layout(prog='dot') graph.draw(filename) diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index 34d1f86..e8d365f 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -60,6 +60,9 @@ def __init__(self, # Save the s3_path variable self._s3_path = s3_path + # Save the dependent nodes from the S3 Node + self._dependency_nodes = list() + super(S3Node, self).__init__( id=id, retryDelay=RETRY_DELAY, @@ -81,3 +84,14 @@ def path(self): return self._s3_path.s3_path else: return self._s3_path + + @property + def dependency_nodes(self): + """Fetch the dependent nodes for the S3 node + """ + return self._dependency_nodes + + def add_dependency_node(self, input_node): + """Add nodes to the list of dependencies among S3 Nodes + """ + self._dependency_nodes.append(input_node) diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index f7d8e6c..4d24586 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -179,12 +179,16 @@ def create_output_nodes(self, output_node, sub_dirs): Returns: s3_output_nodes(dict of s3Node): Output nodes keyed with sub dirs """ - return dict( - ( - sub_dir, - self.create_s3_data_node(S3Path(sub_dir, is_directory=True, - parent_dir=output_node.path())) - ) for sub_dir in sub_dirs) + output_map = dict() + for sub_dir in sub_dirs: + new_node = self.create_s3_data_node( + S3Path(sub_dir, is_directory=True, + parent_dir=output_node.path())) + new_node.add_dependency_node(output_node) + + output_map[sub_dir] = new_node + + return output_map def create_script(self, s3_object): """Set the s3 path for s3 objects with the s3_source_dir @@ -216,16 +220,6 @@ def copy_s3(self, input_node, dest_uri): if not(isinstance(input_node, S3Node) and isinstance(dest_uri, S3Path)): raise ETLInputError('input_node and uri have type mismatch') - # Copy the input node. 
We need to use directories for copying if we - # are going to omit the data format - if input_node.path().is_directory: - uri = input_node.path().uri - else: - uri = '/'.join(input_node.path().uri.split('/')[:-1]) - - new_input_node = self.create_s3_data_node( - s3_object=S3Path(uri=uri, is_directory=True)) - # create s3 node for output output_node = self.create_s3_data_node(dest_uri) @@ -234,7 +228,7 @@ def copy_s3(self, input_node, dest_uri): CopyActivity, schedule=self.schedule, resource=self.resource, - input_node=new_input_node, + input_node=input_node, output_node=output_node, max_retries=self.max_retries ) @@ -255,12 +249,15 @@ def merge_s3_nodes(self, input_nodes): """ depends_on = list() combined_node = self.create_s3_data_node() + for string_key, input_node in input_nodes.iteritems(): dest_uri = S3Path(key=string_key, is_directory=True, parent_dir=combined_node.path()) copy_activity = self.copy_s3(input_node=input_node, dest_uri=dest_uri) depends_on.append(copy_activity) + combined_node.add_dependency_node(copy_activity.output) + return combined_node, depends_on @property diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index 305dd1f..c017bbf 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -18,8 +18,7 @@ def __init__(self, path, **kwargs): **kwargs(optional): Keyword arguments directly passed to base class """ super(ExtractLocalStep, self).__init__(**kwargs) - self._output = self.create_s3_data_node(s3_object=S3File(path=path)) - + self._output = self.create_s3_data_node(S3File(path=path)) @classmethod def arguments_processor(cls, etl, input_args): diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index 1b71c91..bfe91cf 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -17,7 +17,7 @@ def __init__(self, uri, **kwargs): **kwargs(optional): Keyword arguments directly passed to base class """ super(ExtractS3Step, self).__init__(**kwargs) - self._output = self.create_s3_data_node(s3_object=S3Path(uri=uri)) + self._output = self.create_s3_data_node(S3Path(uri=uri)) @classmethod def arguments_processor(cls, etl, input_args): From 6c8428a9286e1688e929379573bf32d1e42e2780 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 2 Jan 2015 02:06:50 -0800 Subject: [PATCH 027/175] output path for steps --- dataduct/steps/emr_streaming.py | 18 +++--------------- dataduct/steps/etl_step.py | 10 ++++++++++ dataduct/steps/extract_local.py | 5 +++-- dataduct/steps/extract_rds.py | 6 ++++-- dataduct/steps/extract_redshift.py | 4 +++- dataduct/steps/transform.py | 21 ++++++++++----------- 6 files changed, 33 insertions(+), 31 deletions(-) diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 1678bb9..c200e2a 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -82,6 +82,7 @@ def __init__(self, reducer=None, hadoop_params=None, depends_on=None, + output_path=None, **kwargs): """Constructor for the EMRStreamingStep class @@ -97,7 +98,8 @@ def __init__(self, if depends_on is not None: self._depends_on = depends_on - self._output = self.create_s3_data_node() + self._output = self.create_s3_data_node( + self.get_output_s3_path(output_path)) # Create S3File with script / command provided mapper = self.create_script(S3File(path=mapper)) @@ -122,20 +124,6 @@ def __init__(self, max_retries=self.max_retries ) - def merge_s3_nodes(self, input_nodes): - """Override the merge S3Node case for EMR Streaming Step - - Args: - 
input_nodes(dict): Map of the form {'node_name': node} - - Returns: - output_node(list of S3Node): list of input nodes - depends_on(list): Empty list - """ - depends_on = [] - output_node = input_nodes.values() - return output_node, depends_on - @classmethod def arguments_processor(cls, etl, input_args): """Parse the step arguments according to the ETL pipeline diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 4d24586..8192eba 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -439,3 +439,13 @@ def pop_inputs(input_args): input_args.pop('input_path', None) return input_args + + @staticmethod + def get_output_s3_path(output_path): + """Create an S3 Path variable based on the output path + """ + if output_path: + s3_path = S3Path(uri=output_path) + else: + s3_path = None + return s3_path diff --git a/dataduct/steps/extract_local.py b/dataduct/steps/extract_local.py index c017bbf..1275358 100644 --- a/dataduct/steps/extract_local.py +++ b/dataduct/steps/extract_local.py @@ -10,7 +10,7 @@ class ExtractLocalStep(ETLStep): """ExtractLocal Step class that helps get data from a local file """ - def __init__(self, path, **kwargs): + def __init__(self, path, output_path=None, **kwargs): """Constructor for the ExtractLocalStep class Args: @@ -18,7 +18,8 @@ def __init__(self, path, **kwargs): **kwargs(optional): Keyword arguments directly passed to base class """ super(ExtractLocalStep, self).__init__(**kwargs) - self._output = self.create_s3_data_node(S3File(path=path)) + self._output = self.create_s3_data_node( + S3File(path=path, s3_path=self.get_output_s3_path(output_path))) @classmethod def arguments_processor(cls, etl, input_args): diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index afa1df8..6e55a79 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -38,6 +38,7 @@ def __init__(self, sql=None, host_name=None, database=None, + output_path=None, depends_on=None, **kwargs): """Constructor for the ExtractRdsStep class @@ -96,10 +97,11 @@ def __init__(self, max_retries=self.max_retries, ) + self._output = self.create_s3_data_node( + self.get_output_s3_path(output_path)) + # This shouldn't be necessary but - # AWS uses \\n as null, so we need to remove it - self._output = self.create_s3_data_node() - command = ' '.join(["cat", "${INPUT1_STAGING_DIR}/*", "| sed 's/\\\\\\\\n/NULL/g'", # replace \\n diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index c4b6e23..7d2541e 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -16,6 +16,7 @@ def __init__(self, redshift_database, insert_mode="TRUNCATE", depends_on=None, + output_path=None, **kwargs): """Constructor for the ExtractRedshiftStep class @@ -40,7 +41,8 @@ def __init__(self, table_name=table, ) - self._output = self.create_s3_data_node() + self._output = self.create_s3_data_node( + self.get_output_s3_path(output_path)) self.create_pipeline_object( object_class=RedshiftCopyActivity, diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 8e35b1d..47a47c8 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -11,6 +11,7 @@ SCRIPT_ARGUMENT_TYPE_STRING = 'string' SCRIPT_ARGUMENT_TYPE_SQL = 'sql' + class TransformStep(ETLStep): """Transform Step class that helps run scripts on resouces """ @@ -22,6 +23,7 @@ def __init__(self, script_arguments=None, additional_s3_files=None, depends_on=None, + output_path=None, **kwargs): 
"""Constructor for the TransformStep class @@ -41,11 +43,9 @@ def __init__(self, if depends_on is not None: self._depends_on = depends_on - # Create output_node if not provided - if self._output is None: - base_output_node = self.create_s3_data_node() - else: - base_output_node = self._output + # Create output_node based on output_path + base_output_node = self.create_s3_data_node( + self.get_output_s3_path(output_path)) # Create S3File if script path provided if script: @@ -68,12 +68,11 @@ def __init__(self, ) # Translate output nodes if output map provided - if self._output is None: - if output_node: - self._output = self.create_output_nodes( - base_output_node, output_node) - else: - self._output = base_output_node + if output_node: + self._output = self.create_output_nodes( + base_output_node, output_node) + else: + self._output = base_output_node def translate_arguments(self, script_arguments): """Translate script argument to lists From 995d7e71d2d9576c08c44856c8cf836e27d7d018 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 2 Jan 2015 02:24:11 -0800 Subject: [PATCH 028/175] clean up stuff --- dataduct/pipeline/emr_activity.py | 3 +-- dataduct/pipeline/pipeline_object.py | 3 +++ dataduct/pipeline/shell_command_activity.py | 3 +-- dataduct/steps/emr_job.py | 2 +- dataduct/steps/emr_streaming.py | 2 +- dataduct/steps/etl_step.py | 1 - dataduct/steps/extract_rds.py | 2 +- dataduct/steps/extract_redshift.py | 4 ++-- dataduct/steps/load_redshift.py | 4 ++-- dataduct/steps/transform.py | 20 ++++++++++---------- 10 files changed, 22 insertions(+), 22 deletions(-) diff --git a/dataduct/pipeline/emr_activity.py b/dataduct/pipeline/emr_activity.py index 53824fc..cea06a2 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -61,5 +61,4 @@ def __init__(self, input=input_node, ) - if additional_files: - self.add_additional_files(additional_files) + self.add_additional_files(additional_files) diff --git a/dataduct/pipeline/pipeline_object.py b/dataduct/pipeline/pipeline_object.py index b84cd32..123bae0 100644 --- a/dataduct/pipeline/pipeline_object.py +++ b/dataduct/pipeline/pipeline_object.py @@ -118,6 +118,9 @@ def add_additional_files(self, new_files): Args: new_files(S3File): list of new S3 files for the activity """ + if new_files is None: + return + for new_file in new_files: if not isinstance(new_file, S3File): raise ETLInputError('File must be an S3 File object') diff --git a/dataduct/pipeline/shell_command_activity.py b/dataduct/pipeline/shell_command_activity.py index 0c47fb8..114f427 100644 --- a/dataduct/pipeline/shell_command_activity.py +++ b/dataduct/pipeline/shell_command_activity.py @@ -76,5 +76,4 @@ def __init__(self, ) # Add the additional s3 files - if additional_s3_files is not None: - self.add_additional_files(additional_s3_files) + self.add_additional_files(additional_s3_files) diff --git a/dataduct/steps/emr_job.py b/dataduct/steps/emr_job.py index 963d453..fb01b00 100644 --- a/dataduct/steps/emr_job.py +++ b/dataduct/steps/emr_job.py @@ -34,7 +34,7 @@ def __init__(self, input_node=self.input, schedule=self.schedule, emr_step_string=step_string, - output_node=self._output, + output_node=self.output, depends_on=self.depends_on, max_retries=self.max_retries ) diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index c200e2a..0b86455 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -118,7 +118,7 @@ def __init__(self, input_node=self.input, schedule=self.schedule, 
emr_step_string=step_string, - output_node=self._output, + output_node=self.output, additional_files=additional_files, depends_on=self.depends_on, max_retries=self.max_retries diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 8192eba..7811918 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -53,7 +53,6 @@ def __init__(self, id, s3_data_dir=None, s3_log_dir=None, self.resource = resource self.max_retries = max_retries self._depends_on = list() - self._input = None self._output = None self._objects = dict() self._required_steps = list() diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index 6e55a79..96ed99e 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -112,7 +112,7 @@ def __init__(self, self.create_pipeline_object( object_class=ShellCommandActivity, input_node=intermediate_node, - output_node=self._output, + output_node=self.output, command=command, max_retries=self.max_retries, resource=self.resource, diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index 7d2541e..3967dcf 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -47,8 +47,8 @@ def __init__(self, self.create_pipeline_object( object_class=RedshiftCopyActivity, max_retries=self.max_retries, - input_node=self._input_node, - output_node=self._output, + input_node=self.input, + output_node=self.output, insert_mode=insert_mode, resource=self.resource, schedule=self.schedule, diff --git a/dataduct/steps/load_redshift.py b/dataduct/steps/load_redshift.py index 320e616..870db70 100644 --- a/dataduct/steps/load_redshift.py +++ b/dataduct/steps/load_redshift.py @@ -55,8 +55,8 @@ def __init__(self, self.create_pipeline_object( object_class=RedshiftCopyActivity, max_retries=self.max_retries, - input_node=self._input_node, - output_node=self._output, + input_node=self.input, + output_node=self.output, insert_mode=insert_mode, resource=self.resource, schedule=self.schedule, diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 47a47c8..f0d5182 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -35,11 +35,11 @@ def __init__(self, additional_s3_files(list of S3File): additional files used **kwargs(optional): Keyword arguments directly passed to base class """ + super(TransformStep, self).__init__(**kwargs) + if not exactly_one(command, script): raise ETLInputError('Both command or script found') - super(TransformStep, self).__init__(**kwargs) - if depends_on is not None: self._depends_on = depends_on @@ -53,9 +53,16 @@ def __init__(self, script_arguments = self.translate_arguments(script_arguments) + # Translate output nodes if output map provided + if output_node: + self._output = self.create_output_nodes( + base_output_node, output_node) + else: + self._output = base_output_node + self.create_pipeline_object( object_class=ShellCommandActivity, - input_node=self._input_node, + input_node=self.input, output_node=base_output_node, resource=self.resource, schedule=self.schedule, @@ -67,13 +74,6 @@ def __init__(self, additional_s3_files=additional_s3_files, ) - # Translate output nodes if output map provided - if output_node: - self._output = self.create_output_nodes( - base_output_node, output_node) - else: - self._output = base_output_node - def translate_arguments(self, script_arguments): """Translate script argument to lists From ab595e196cb69fd35cf52c1e24dc6350ca245d03 Mon Sep 17 00:00:00 2001 From: Sourabh 
Bajaj Date: Fri, 2 Jan 2015 11:32:56 -0800 Subject: [PATCH 029/175] script directory support for transform step --- dataduct/pipeline/pipeline_object.py | 7 +--- dataduct/pipeline/s3_node.py | 30 +++++++-------- dataduct/steps/etl_step.py | 4 +- dataduct/steps/scripts/script_runner.py | 50 +++++++++++++++++++++++++ dataduct/steps/transform.py | 46 ++++++++++++++++++++--- dataduct/utils/constants.py | 1 + examples/example_transform.yaml | 10 +++++ 7 files changed, 121 insertions(+), 27 deletions(-) create mode 100644 dataduct/steps/scripts/script_runner.py diff --git a/dataduct/pipeline/pipeline_object.py b/dataduct/pipeline/pipeline_object.py index 123bae0..42af751 100644 --- a/dataduct/pipeline/pipeline_object.py +++ b/dataduct/pipeline/pipeline_object.py @@ -100,15 +100,12 @@ def __setitem__(self, key, value): key(str): Key of the item to be fetched value: Value of the item to be fetched """ - # Do not add none values - if value is None: - return - # Store value as a list if there is only one if not isinstance(value, list): value = [value] - self.fields[key].extend(value) + # Do not add none values + self.fields[key].extend([x for x in value if x is not None]) if key == 'dependsOn': self.fields[key] = list(set(self.fields[key])) diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index e8d365f..e9f10f7 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -23,7 +23,7 @@ class S3Node(PipelineObject): def __init__(self, id, schedule, - s3_path, + s3_object, precondition=None, format=None, **kwargs): @@ -32,7 +32,7 @@ def __init__(self, Args: id(str): id of the object schedule(Schedule): pipeline schedule - s3_path(S3Path / S3File / S3Directory): s3 location + s3_object(S3Path / S3File / S3Directory): s3 location precondition(Precondition): precondition to the data node **kwargs(optional): Keyword arguments directly passed to base class """ @@ -46,19 +46,19 @@ def __init__(self, raise ETLInputError( 'Input precondition must be of the type Precondition') - if not(isinstance(s3_path, S3Path) or - isinstance(s3_path, S3File) or - isinstance(s3_path, S3Directory)): + if not(isinstance(s3_object, S3Path) or + isinstance(s3_object, S3File) or + isinstance(s3_object, S3Directory)): raise ETLInputError('Mismatched type for S3 path') additional_args = {} - if isinstance(s3_path, S3Path) and s3_path.is_directory: - additional_args['directoryPath'] = s3_path + if isinstance(s3_object, S3Path) and s3_object.is_directory: + additional_args['directoryPath'] = s3_object else: - additional_args['filePath'] = s3_path + additional_args['filePath'] = s3_object - # Save the s3_path variable - self._s3_path = s3_path + # Save the s3_object variable + self._s3_object = s3_object # Save the dependent nodes from the S3 Node self._dependency_nodes = list() @@ -75,15 +75,15 @@ def __init__(self, def path(self): - """Get the s3_path associated with the S3 data node + """Get the s3_object associated with the S3 data node Returns: - s3_path(S3Path): The s3 path of the node can a directory or file + s3_object(S3Path): The s3 path of the node can a directory or file """ - if isinstance(self._s3_path, S3File): - return self._s3_path.s3_path + if isinstance(self._s3_object, S3File): + return self._s3_object.s3_path else: - return self._s3_path + return self._s3_object @property def dependency_nodes(self): diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 7811918..acf79d6 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -152,9 
+152,9 @@ def create_s3_data_node(self, s3_object=None, **kwargs): s3_object = s3_dir s3_node = self.create_pipeline_object( - S3Node, + object_class=S3Node, schedule=self.schedule, - s3_path=s3_object, + s3_object=s3_object, **kwargs ) diff --git a/dataduct/steps/scripts/script_runner.py b/dataduct/steps/scripts/script_runner.py new file mode 100644 index 0000000..d87864f --- /dev/null +++ b/dataduct/steps/scripts/script_runner.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +""" +This script initiates the different calls needed when running +a transform step with the script_directory argument +""" + +# imports +import argparse +import os +import subprocess + + +def run_command(arguments): + """ + Args: + arguments(list of str): Arguments to be executed as a command. Arguments + are passed as if calling subprocess.call() directly + """ + return subprocess.call(arguments) + + +def main(): + """ + Parses the command line arguments and runs the suitable functions + """ + parser = argparse.ArgumentParser() + # Environment variable for the source directory + parser.add_argument('--INPUT_SRC_ENV_VAR', dest='input_src_env_var') + + # Argument for script name + parser.add_argument('--SCRIPT_NAME', dest='script_name') + args, ext_script_args = parser.parse_known_args() + + # Check if the source directory exists + input_src_dir = os.getenv(args.input_src_env_var) + if not os.path.exists(input_src_dir): + raise Exception(input_src_dir + " does not exist") + + run_command(['ls', '-l', input_src_dir]) + run_command(['chmod', '-R', '+x', input_src_dir]) + run_command(['ls', '-l', input_src_dir]) + + input_file = os.path.join(input_src_dir, args.script_name) + result = run_command([input_file] + ext_script_args) + if result != 0: + raise Exception("Script failed.") + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index f0d5182..0968b08 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -1,9 +1,13 @@ """ ETL step wrapper for shell command activity can be executed on Ec2 / EMR """ +import os + from .etl_step import ETLStep from ..pipeline import ShellCommandActivity +from ..pipeline import S3Node from ..s3 import S3File +from ..s3 import S3Directory from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError from ..utils import constants as const @@ -19,6 +23,8 @@ class TransformStep(ETLStep): def __init__(self, command=None, script=None, + script_directory=None, + script_name=None, output_node=None, script_arguments=None, additional_s3_files=None, @@ -30,6 +36,8 @@ def __init__(self, Args: command(str): command to be executed directly script(path): local path to the script that should executed + script_directory(path): local path to the script directory + script_name(str): script to be executed in the directory output_node(dict): output data nodes from the transform script_arguments(list of str): list of arguments to the script additional_s3_files(list of S3File): additional files used @@ -37,8 +45,9 @@ def __init__(self, """ super(TransformStep, self).__init__(**kwargs) - if not exactly_one(command, script): - raise ETLInputError('Both command or script found') + if not exactly_one(command, script, script_directory): + raise ETLInputError( + 'Only one of script, command and directory allowed') if depends_on is not None: self._depends_on = depends_on @@ -47,12 +56,39 @@ def __init__(self, base_output_node = self.create_s3_data_node( self.get_output_s3_path(output_path)) + script_arguments = 
self.translate_arguments(script_arguments) + + input_nodes = [self.input] + if script_directory: + # The script to be run with the directory + if script_name is None: + raise ETLInputError('script_name required with directory') + + script_directory = self.create_script( + S3Directory(path=script_directory)) + + # Input node for the source code in the directory + input_nodes.append(self.create_pipeline_object( + object_class=S3Node, + schedule=self.schedule, + s3_object=script_directory + )) + + # We need to create an additional script that later calls the main + # script as we need to change permissions of the input directory + ip_src_env = 'INPUT%d_STAGING_DIR' % (1 if not self.input else 2) + additional_args = ['--INPUT_SRC_ENV_VAR=%s' % ip_src_env, + '--SCRIPT_NAME=%s' % script_name] + + script_arguments = additional_args + script_arguments + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.SCRIPT_RUNNER_PATH) + # Create S3File if script path provided if script: script = self.create_script(S3File(path=script)) - script_arguments = self.translate_arguments(script_arguments) - # Translate output nodes if output map provided if output_node: self._output = self.create_output_nodes( @@ -62,7 +98,7 @@ def __init__(self, self.create_pipeline_object( object_class=ShellCommandActivity, - input_node=self.input, + input_node=input_nodes, output_node=base_output_node, resource=self.resource, schedule=self.schedule, diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index 261121c..f5023af 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -2,3 +2,4 @@ """ EMR_CLUSTER_STR = 'emr' +SCRIPT_RUNNER_PATH = 'scripts/script_runner.py' diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml index 4ecda05..e82b1ee 100644 --- a/examples/example_transform.yaml +++ b/examples/example_transform.yaml @@ -6,10 +6,20 @@ description : Example for the transform step steps: - step_type: extract-local + name: extract-node path: examples/resources/test_table1.tsv - step_type: transform + input_node: extract-node script: examples/scripts/s3_profiler.py script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR + +- step_type: transform + input_node: extract-node + script_directory: examples/scripts/ + script_name: s3_profiler.py + script_arguments: + - --input=INPUT1_STAGING_DIR + - --output=OUTPUT1_STAGING_DIR From facd4f6878382240c979c18b385903397bd84ea6 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 2 Jan 2015 14:49:00 -0800 Subject: [PATCH 030/175] dependency step --- dataduct/etl/etl_pipeline.py | 10 +- dataduct/steps/__init__.py | 23 +-- dataduct/steps/emr_job.py | 4 - dataduct/steps/emr_streaming.py | 4 - dataduct/steps/etl_step.py | 14 +- dataduct/steps/extract_rds.py | 4 - dataduct/steps/extract_redshift.py | 4 - dataduct/steps/load_redshift.py | 4 - dataduct/steps/pipeline_dependencies.py | 71 +++++++++ dataduct/steps/qa_transform.py | 6 +- .../scripts/pipeline_dependency_check.py | 142 ++++++++++++++++++ dataduct/steps/sql_command.py | 4 - dataduct/steps/transform.py | 10 +- dataduct/utils/constants.py | 1 + examples/example_pipeline_dependency.yaml | 14 ++ 15 files changed, 265 insertions(+), 50 deletions(-) create mode 100644 dataduct/steps/pipeline_dependencies.py create mode 100644 dataduct/steps/scripts/pipeline_dependency_check.py create mode 100644 examples/example_pipeline_dependency.yaml diff --git a/dataduct/etl/etl_pipeline.py 
b/dataduct/etl/etl_pipeline.py index dbf882d..bcda822 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -25,6 +25,7 @@ from ..steps import ExtractRedshiftStep from ..steps import ExtractS3Step from ..steps import LoadRedshiftStep +from ..steps import PipelineDependenciesStep from ..steps import SqlCommandStep from ..steps import TransformStep from ..steps import QATransformStep @@ -453,6 +454,9 @@ def parse_step_args(self, step_type, **kwargs): elif step_type == 'emr-step': step_class = EMRJobStep + elif step_type == 'pipeline-dependencies': + step_class = PipelineDependenciesStep + elif step_type == 'load-redshift': step_class = LoadRedshiftStep @@ -510,7 +514,11 @@ def create_steps(self, steps_params, is_bootstrap=False): 'input_path' not in step_param: step_param['input_node'] = input_node - step_class, step_args = self.parse_step_args(**step_param) + try: + step_class, step_args = self.parse_step_args(**step_param) + except Exception: + print 'Error creating step with params : ', step_param + raise try: step = step_class(**step_args) diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index c2d3fa4..6b8b81f 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -1,11 +1,12 @@ -from etl_step import ETLStep -from emr_streaming import EMRStreamingStep -from emr_job import EMRJobStep -from extract_local import ExtractLocalStep -from extract_rds import ExtractRdsStep -from extract_redshift import ExtractRedshiftStep -from extract_s3 import ExtractS3Step -from load_redshift import LoadRedshiftStep -from sql_command import SqlCommandStep -from transform import TransformStep -from qa_transform import QATransformStep +from .etl_step import ETLStep +from .emr_streaming import EMRStreamingStep +from .emr_job import EMRJobStep +from .extract_local import ExtractLocalStep +from .extract_rds import ExtractRdsStep +from .extract_redshift import ExtractRedshiftStep +from .extract_s3 import ExtractS3Step +from .load_redshift import LoadRedshiftStep +from .pipeline_dependencies import PipelineDependenciesStep +from .sql_command import SqlCommandStep +from .transform import TransformStep +from .qa_transform import QATransformStep diff --git a/dataduct/steps/emr_job.py b/dataduct/steps/emr_job.py index fb01b00..4158ec8 100644 --- a/dataduct/steps/emr_job.py +++ b/dataduct/steps/emr_job.py @@ -11,7 +11,6 @@ class EMRJobStep(ETLStep): def __init__(self, step_string, - depends_on=None, **kwargs): """Constructor for the EMRJobStep class @@ -25,9 +24,6 @@ def __init__(self, """ super(EMRJobStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - self.activity = self.create_pipeline_object( object_class=EmrActivity, resource=self.resource, diff --git a/dataduct/steps/emr_streaming.py b/dataduct/steps/emr_streaming.py index 0b86455..66f2156 100644 --- a/dataduct/steps/emr_streaming.py +++ b/dataduct/steps/emr_streaming.py @@ -81,7 +81,6 @@ def __init__(self, mapper, reducer=None, hadoop_params=None, - depends_on=None, output_path=None, **kwargs): """Constructor for the EMRStreamingStep class @@ -95,9 +94,6 @@ def __init__(self, """ super(EMRStreamingStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - self._output = self.create_s3_data_node( self.get_output_s3_path(output_path)) diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index acf79d6..c71debd 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -56,7 +56,7 @@ def 
__init__(self, id, s3_data_dir=None, s3_log_dir=None, self._output = None self._objects = dict() self._required_steps = list() - self._activities = list() + self._required_activities = list() self._input_node = input_node if input_path is not None and input_node is not None: @@ -95,14 +95,12 @@ def add_required_steps(self, required_steps): """ self._required_steps.extend(required_steps) - # Find all activities which need to be completed. - required_activities = [] - for step in self._required_steps: - required_activities.extend(step.activities) + for step in required_steps: + self._required_activities.extend(step.activities) # Set required_acitivites as depend_on variable of all activities for activity in self.activities: - activity['dependsOn'] = required_activities + activity['dependsOn'] = self._required_activities def create_pipeline_object(self, object_class, **kwargs): """Create the pipeline objects associated with the step @@ -123,6 +121,10 @@ def create_pipeline_object(self, object_class, **kwargs): str(instance_count) new_object = object_class(object_id, **kwargs) + + if isinstance(new_object, Activity): + new_object['dependsOn'] = self._required_activities + self._objects[object_id] = new_object return new_object diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index 96ed99e..6c8f3bf 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -39,7 +39,6 @@ def __init__(self, host_name=None, database=None, output_path=None, - depends_on=None, **kwargs): """Constructor for the ExtractRdsStep class @@ -55,9 +54,6 @@ def __init__(self, super(ExtractRdsStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - if table: sql = 'select * from %s;' % table elif sql: diff --git a/dataduct/steps/extract_redshift.py b/dataduct/steps/extract_redshift.py index 3967dcf..e524668 100644 --- a/dataduct/steps/extract_redshift.py +++ b/dataduct/steps/extract_redshift.py @@ -15,7 +15,6 @@ def __init__(self, table, redshift_database, insert_mode="TRUNCATE", - depends_on=None, output_path=None, **kwargs): """Constructor for the ExtractRedshiftStep class @@ -29,9 +28,6 @@ def __init__(self, """ super(ExtractRedshiftStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - # Create input node self._input_node = self.create_pipeline_object( object_class=RedshiftNode, diff --git a/dataduct/steps/load_redshift.py b/dataduct/steps/load_redshift.py index 870db70..f76c8f5 100644 --- a/dataduct/steps/load_redshift.py +++ b/dataduct/steps/load_redshift.py @@ -17,7 +17,6 @@ def __init__(self, insert_mode="TRUNCATE", max_errors=None, replace_invalid_char=None, - depends_on=None, **kwargs): """Constructor for the LoadRedshiftStep class @@ -32,9 +31,6 @@ def __init__(self, """ super(LoadRedshiftStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - # Create output node self._output = self.create_pipeline_object( object_class=RedshiftNode, diff --git a/dataduct/steps/pipeline_dependencies.py b/dataduct/steps/pipeline_dependencies.py new file mode 100644 index 0000000..ab55e3e --- /dev/null +++ b/dataduct/steps/pipeline_dependencies.py @@ -0,0 +1,71 @@ +""" +ETL step for pipeline dependencies using transform step +""" +import os + +from .transform import TransformStep +from ..utils import constants as const + + +class PipelineDependenciesStep(TransformStep): + """PipelineDependencies Step class that helps wait for other pipelines + to finish + """ + + def 
__init__(self, + id, + dependent_pipelines=None, + refresh_rate=300, + start_date=None, + script_arguments=None, + **kwargs): + """Constructor for the QATransformStep class + + Args: + sns_arn(str): sns topic arn for QA steps + script_arguments(list of str): list of arguments to the script + **kwargs(optional): Keyword arguments directly passed to base class + """ + + if script_arguments is None: + script_arguments = list() + + if dependent_pipelines is None: + raise ValueError('Must have some dependencies for dependency step') + + if start_date is None: + start_date = "#{format(@scheduledStartTime,'YYYY-MM-dd')}" + + script_arguments.extend( + [ + '--start_date=%s' % start_date, + '--refresh_rate=%s' % str(refresh_rate), + '--dependencies', + ] + ) + script_arguments.extend(dependent_pipelines) + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.DEPENDENCY_SCRIPT_PATH) + + super(PipelineDependenciesStep, self).__init__( + id=id, + script=script, + script_arguments=script_arguments, + **kwargs) + + self._output = None + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.ec2_resource + + return step_args diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index 8987ca8..bbd82a8 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -1,5 +1,5 @@ """ -ETL step wrapper for QA step can be executed on Ec2 / EMR +ETL step wrapper for QA step can be executed on Ec2 resource """ from .transform import TransformStep from ..config import Config @@ -31,8 +31,8 @@ def __init__(self, script_arguments.extend( [ - "--sns_topic_arn=%s" % sns_topic_arn, - "--test_name=%s" % (pipeline_name + "." + id) + '--sns_topic_arn=%s' % sns_topic_arn, + '--test_name=%s' % (pipeline_name + "." + id) ] ) diff --git a/dataduct/steps/scripts/pipeline_dependency_check.py b/dataduct/steps/scripts/pipeline_dependency_check.py new file mode 100644 index 0000000..704adc8 --- /dev/null +++ b/dataduct/steps/scripts/pipeline_dependency_check.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +""" +Allows pipeline to have dependencies with other pipelines + +Expected behaviour of dependency step: + +1) If pipeline X does not depend on anything (dependency list is empty ""), + then the transform step should exit safely (sys.exit) + +2) Assume pipeline X depends on Y. If Y does not exist, then throw an + exception saying "Pipeline Y not found". + +3) Assume pipeline X depends on Y. If pipeline Y just sleeps for 10 minutes, + then pipeline X should not finish until after Y finishes in 10 minutes. + +4) Assume pipeline X depends on Y. Pipeline Y exists but no instances of Y ran + today. Pipeline X should throw an exception saying "Y does not exist today". + +5) Assume pipeline X depends on Y. Pipeline Y was "CANCELED"/"CANCELLED" today. + Pipeline X should throw exception saying "Bad status" + +6) Assume pipeline X depends on Y. Pipeline Y was "TIMEDOUT" today. Pipeline X + should throw exception saying "Bad status" + +7) Assume pipeline X depends on Y. Pipeline Y was "FAILED" today. Pipeline X + should throw exception saying "Bad status" + +8) Assume pipeline X depends on Y. 
Pipeline Y was "CASCADE_FAILED" today. + Pipeline X should throw exception saying "Bad status" +""" + +import argparse +import sys +import time +from datetime import datetime + +from dataduct.pipeline.utils import list_pipelines +from dataduct.pipeline.utils import list_pipeline_instances + + +# Docs and API spelling of "CANCELED" don't match +FAILED_STATUSES = set(['CANCELED', 'CANCELLED', 'TIMEDOUT', 'FAILED', + 'CASCADE_FAILED']) + +# Pipeline attributes +STATUS = '@status' +START_TIME = '@scheduledStartTime' +FINISHED = 'FINISHED' + + +def check_dependencies_ready(dependencies, start_date): + """Checks if every dependent pipeline has completed + + Args: + dependencies(list of str): list of pipeline name that it depends on + start_date(str): string representing the start date of the pipeline + """ + + print 'Checking dependency at ', str(datetime.now()) + + dependency_ready = True + + # Convert date string to datetime object + start_date = datetime.strptime(start_date, '%Y-%m-%d') + + for pipeline in dependencies: + # Get instances of each pipeline + instances = list_pipeline_instances(pipeline) + + # Collect all pipeline instances that are scheduled for today + instances_today = [] + for instance in instances: + date = datetime.strptime(instance[START_TIME], '%Y-%m-%dT%H:%M:%S') + if date.date() == start_date.date(): + instances_today.append(instance) + + # Dependency pipeline has not started from today + if not instances_today: + dependency_ready = False + + for instance in instances_today: + # One of the dependency failed/cancelled + if instance[STATUS] in FAILED_STATUSES: + raise Exception( + 'Pipeline %s has bad status: %s' + % (pipeline, instance[STATUS]) + ) + # Dependency is still running + elif instance[STATUS] != FINISHED: + dependency_ready = False + + # All dependencies are done + return dependency_ready + + +def main(): + """ + Main Function + """ + parser = argparse.ArgumentParser() + parser.add_argument( + '--dependencies', type=str, nargs='+', default=None) + parser.add_argument('--refresh_rate', dest='refresh_rate', default='900') + parser.add_argument('--start_date', dest='start_date') + + args = parser.parse_args() + + # Exit if there are no dependencies + if not args.dependencies: + sys.exit() + + # Create mapping from pipeline name to id + pipeline_name_to_id = dict( + (pipeline['name'], pipeline['id']) for pipeline in list_pipelines() + ) + + # Remove whitespace from dependency list + dependencies = map(str.strip, args.dependencies) + + # Check if all dependencies are valid pipelines + for dependency in dependencies: + if dependency not in pipeline_name_to_id: + raise Exception('Pipeline not found: %s.' % dependency) + + # Map from pipeline object to pipeline ID + dependencies = [pipeline_name_to_id[dependency] + for dependency in dependencies] + + print 'Start checking for dependencies' + start_time = datetime.now() + + # Loop until all dependent pipelines have finished + while not check_dependencies_ready(dependencies, args.start_date): + print 'checking' + time.sleep(float(args.refresh_rate)) + + print 'Finished checking for dependencies. 
Total time spent: ', + print (datetime.now() - start_time).total_seconds(), ' seconds' + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 267a645..ab89686 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -18,7 +18,6 @@ def __init__(self, script_arguments=None, queue=None, command=None, - depends_on=None, **kwargs): """Constructor for the SqlCommandStep class @@ -35,9 +34,6 @@ def __init__(self, super(SqlCommandStep, self).__init__(**kwargs) - if depends_on is not None: - self._depends_on = depends_on - # Create S3File with script / command provided if script: script = self.create_script(S3File(path=script)) diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 0968b08..ff923c8 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -28,7 +28,6 @@ def __init__(self, output_node=None, script_arguments=None, additional_s3_files=None, - depends_on=None, output_path=None, **kwargs): """Constructor for the TransformStep class @@ -49,16 +48,17 @@ def __init__(self, raise ETLInputError( 'Only one of script, command and directory allowed') - if depends_on is not None: - self._depends_on = depends_on - # Create output_node based on output_path base_output_node = self.create_s3_data_node( self.get_output_s3_path(output_path)) script_arguments = self.translate_arguments(script_arguments) - input_nodes = [self.input] + if self.input: + input_nodes = [self.input] + else: + input_nodes = list() + if script_directory: # The script to be run with the directory if script_name is None: diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index f5023af..41d6df5 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -3,3 +3,4 @@ EMR_CLUSTER_STR = 'emr' SCRIPT_RUNNER_PATH = 'scripts/script_runner.py' +DEPENDENCY_SCRIPT_PATH = 'scripts/pipeline_dependency_check.py' diff --git a/examples/example_pipeline_dependency.yaml b/examples/example_pipeline_dependency.yaml new file mode 100644 index 0000000..3e0d889 --- /dev/null +++ b/examples/example_pipeline_dependency.yaml @@ -0,0 +1,14 @@ +name : example_pipeline_dependency +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +steps: +- step_type: pipeline-dependencies + name: dependency_step + refresh_rate: 60 + dependent_pipelines: + - example_transform + +- step_type: transform + depends_on: dependency_step + command: whoami >> $OUTPUT1_STAGING_DIR/output.txt From 6b8cc1ebedf92e8746f52a4d61d1c2ffcfa2a5b7 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 5 Jan 2015 16:50:51 -0800 Subject: [PATCH 031/175] share constants cleanly across the project --- dataduct/config/example_config | 51 ++++----------------- dataduct/data_access/connection.py | 2 +- dataduct/etl/etl_pipeline.py | 30 ++++++------ dataduct/pipeline/copy_activity.py | 5 +- dataduct/pipeline/ec2_resource.py | 11 +++-- dataduct/pipeline/emr_activity.py | 3 +- dataduct/pipeline/emr_resource.py | 21 +++++---- dataduct/pipeline/redshift_copy_activity.py | 5 +- dataduct/pipeline/s3_node.py | 3 +- dataduct/pipeline/schedule.py | 3 +- dataduct/pipeline/shell_command_activity.py | 6 +-- dataduct/pipeline/sns_alarm.py | 3 +- dataduct/pipeline/sql_activity.py | 6 +-- dataduct/steps/etl_step.py | 3 +- dataduct/steps/extract_rds.py | 3 ++ dataduct/utils/constants.py | 18 ++++++++ 16 files changed, 84 insertions(+), 89 deletions(-) diff --git a/dataduct/config/example_config b/dataduct/config/example_config index 
9aecaca..819ad05 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -1,48 +1,17 @@ # Constants that are used across the dataduct library ec2: - ROLE: FILL_ME_IN - RESOURCE_ROLE: FILL_ME_IN - INSTANCE_TYPE: m1.large - ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - KEY_PAIR: FILL_ME_IN - SECURITY_GROUP: FILL_ME_IN + INSTANCE_TYPE: m1.large + ETL_AMI: ami-05355a6c # Default AMI used by data pipeline + SECURITY_GROUP: FILL_ME_IN emr: - NUM_CORE_INSTANCES: 3 - CORE_INSTANCE_TYPE: m1.large - TASK_INSTANCE_BID_PRICE: null # null if we want it to be None - TASK_INSTANCE_TYPE: m1.large - MASTER_INSTANCE_TYPE: m1.large - CLUSTER_TIMEOUT: 6 Hours - HADOOP_VERSION: null - HIVE_VERSION: null - PIG_VERSION: null - CLUSTER_AMI: 2.4.7 - -redshift: - DATABASE_NAME: FILL_ME_IN - CLUSTER_ID: FILL_ME_IN - USERNAME: FILL_ME_IN - PASSWORD: FILL_ME_IN - -mysql: - DATABASE_KEY: - HOST: FILL_ME_IN, - USERNAME: FILL_ME_IN, - PASSWORD: FILL_ME_IN + MASTER_INSTANCE_TYPE: m1.large + NUM_CORE_INSTANCES: 1 + CORE_INSTANCE_TYPE: m1.large + CLUSTER_AMI: 2.4.7 etl: - RETRY_DELAY: 10 Minutes - MAX_RETRIES: 0 - S3_ETL_BUCKET: FILL_ME_IN - SNS_TOPIC_ARN_FAILURE: FILL_ME_IN - SNS_TOPIC_ARN_WARNING: FILL_ME_IN - DAILY_LOAD_TIME: 1 # run at 1AM UTC - -bootstrap: - - step_type: transform - input_node: [] - command: whoami >> ${OUTPUT1_STAGING_DIR}/output.txt - resource: FILL_ME_IN - name: bootstrap_transform + S3_ETL_BUCKET: FILL_ME_IN + ROLE: FILL_ME_IN + RESOURCE_ROLE: FILL_ME_IN diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py index 39cc36b..27fe3e9 100644 --- a/dataduct/data_access/connection.py +++ b/dataduct/data_access/connection.py @@ -15,7 +15,7 @@ def redshift_connection(**kwargs): """Fetch a psql connection object to redshift """ if not hasattr(config, 'redshift'): - raise ETLConfigError('Redshift not found in dataduct configs') + raise ETLConfigError('Redshift config not found') connection = psycopg2.connect( host=config.redshift['HOST'], diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index bcda822..176c46e 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -40,15 +40,9 @@ config = Config() S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) -S3_BASE_PATH = config.etl.get('S3_BASE_PATH', '') -SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) - -EC2_RESOURCE_STR = 'ec2' -LOG_STR = 'logs' -DATA_STR = 'data' -SRC_STR = 'src' -CUSTOM_STEPS_PATH = 'CUSTOM_STEPS_PATH' +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) +SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) class ETLPipeline(object): @@ -222,17 +216,19 @@ def _s3_uri(self, data_type): Returns: s3_path(S3Path): S3 location of directory of the given data type """ - if data_type not in [SRC_STR, LOG_STR, DATA_STR]: + if data_type not in [const.SRC_STR, const.LOG_STR, const.DATA_STR]: raise ETLInputError('Unknown data type found') # Versioning prevents using data from older versions key = [S3_BASE_PATH, data_type, self.name, self.version_name] - if self.frequency == 'daily' and data_type in [LOG_STR, DATA_STR]: + if self.frequency == 'daily' and \ + data_type in [const.LOG_STR, const.DATA_STR]: + # For repeated loads, include load date key.append("#{format(@scheduledStartTime, 'YYYYMMdd')}") - if data_type == LOG_STR: + if data_type == const.LOG_STR: return S3LogPath(key, 
bucket=S3_ETL_BUCKET, is_directory=True) else: return S3Path(key, bucket=S3_ETL_BUCKET, is_directory=True) @@ -244,7 +240,7 @@ def s3_log_dir(self): Returns: s3_dir(S3Directory): Directory where s3 log will be stored. """ - return self._s3_uri(LOG_STR) + return self._s3_uri(const.LOG_STR) @property def s3_data_dir(self): @@ -253,7 +249,7 @@ def s3_data_dir(self): Returns: s3_dir(S3Directory): Directory where s3 data will be stored. """ - return self._s3_uri(DATA_STR) + return self._s3_uri(const.DATA_STR) @property def s3_source_dir(self): @@ -262,7 +258,7 @@ def s3_source_dir(self): Returns: s3_dir(S3Directory): Directory where s3 src will be stored. """ - return self._s3_uri(SRC_STR) + return self._s3_uri(const.SRC_STR) @property def ec2_resource(self): @@ -282,7 +278,7 @@ def ec2_resource(self): terminate_after=self.ec2_resource_terminate_after, ) - self.create_bootstrap_steps(EC2_RESOURCE_STR) + self.create_bootstrap_steps(const.EC2_RESOURCE_STR) return self._ec2_resource @property @@ -396,7 +392,7 @@ def get_custom_steps(): for step_def in getattr(config, 'custom_steps', list()): step_type = step_def['step_type'] - path = parse_path(step_def['file_path'], CUSTOM_STEPS_PATH) + path = parse_path(step_def['file_path'], 'CUSTOM_STEPS_PATH') # Load source from the file path provided step_mod = imp.load_source(step_type, path) diff --git a/dataduct/pipeline/copy_activity.py b/dataduct/pipeline/copy_activity.py index 1809d5a..4fc865d 100644 --- a/dataduct/pipeline/copy_activity.py +++ b/dataduct/pipeline/copy_activity.py @@ -6,11 +6,12 @@ from .schedule import Schedule from ..config import Config +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) -RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class CopyActivity(Activity): diff --git a/dataduct/pipeline/ec2_resource.py b/dataduct/pipeline/ec2_resource.py index 39b4880..b648ef8 100644 --- a/dataduct/pipeline/ec2_resource.py +++ b/dataduct/pipeline/ec2_resource.py @@ -6,17 +6,18 @@ from .pipeline_object import PipelineObject from ..s3 import S3LogPath from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() ROLE = config.etl['ROLE'] RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] -INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', 'm1.large') -ETL_AMI = config.ec2.get('ETL_AMI', None) -SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', None) -KEY_PAIR = config.etl.get('KEY_PAIR', None) -RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') +INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', const.M1_LARGE) +ETL_AMI = config.ec2.get('ETL_AMI', const.NONE) +SECURITY_GROUP = config.ec2.get('SECURITY_GROUP', const.NONE) +KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class Ec2Resource(PipelineObject): diff --git a/dataduct/pipeline/emr_activity.py b/dataduct/pipeline/emr_activity.py index cea06a2..4351d60 100644 --- a/dataduct/pipeline/emr_activity.py +++ b/dataduct/pipeline/emr_activity.py @@ -5,10 +5,11 @@ from .activity import Activity from ..config import Config from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) +MAX_RETRIES = 
config.etl.get('MAX_RETRIES', const.ZERO) class EmrActivity(Activity): diff --git a/dataduct/pipeline/emr_resource.py b/dataduct/pipeline/emr_resource.py index a5f6ff4..717a24a 100644 --- a/dataduct/pipeline/emr_resource.py +++ b/dataduct/pipeline/emr_resource.py @@ -6,20 +6,21 @@ from .pipeline_object import PipelineObject from ..s3 import S3LogPath from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -NUM_CORE_INSTANCES = config.emr.get('NUM_CORE_INSTANCES', None) -CORE_INSTANCE_TYPE = config.emr.get('CORE_INSTANCE_TYPE', 'm1.large') -TASK_INSTANCE_BID_PRICE = config.emr.get('TASK_INSTANCE_BID_PRICE', None) -TASK_INSTANCE_TYPE = config.emr.get('TASK_INSTANCE_TYPE', 'm1.large') -MASTER_INSTANCE_TYPE = config.emr.get('MASTER_INSTANCE_TYPE', 'm1.large') -CLUSTER_TIMEOUT = config.emr.get('CLUSTER_TIMEOUT', '6 Hours') -HADOOP_VERSION = config.emr.get('HADOOP_VERSION', None) -HIVE_VERSION = config.emr.get('HIVE_VERSION', None) -PIG_VERSION = config.emr.get('PIG_VERSION', None) +NUM_CORE_INSTANCES = config.emr.get('NUM_CORE_INSTANCES', const.NONE) +CORE_INSTANCE_TYPE = config.emr.get('CORE_INSTANCE_TYPE', const.M1_LARGE) +TASK_INSTANCE_BID_PRICE = config.emr.get('TASK_INSTANCE_BID_PRICE', const.NONE) +TASK_INSTANCE_TYPE = config.emr.get('TASK_INSTANCE_TYPE', const.M1_LARGE) +MASTER_INSTANCE_TYPE = config.emr.get('MASTER_INSTANCE_TYPE', const.M1_LARGE) +CLUSTER_TIMEOUT = config.emr.get('CLUSTER_TIMEOUT', const.DEFAULT_TIMEOUT) +HADOOP_VERSION = config.emr.get('HADOOP_VERSION', const.NONE) +HIVE_VERSION = config.emr.get('HIVE_VERSION', const.NONE) +PIG_VERSION = config.emr.get('PIG_VERSION', const.NONE) CLUSTER_AMI = config.emr.get('CLUSTER_AMI', '2.4.7') -KEY_PAIR = config.etl.get('KEY_PAIR', None) +KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) class EmrResource(PipelineObject): diff --git a/dataduct/pipeline/redshift_copy_activity.py b/dataduct/pipeline/redshift_copy_activity.py index 449cde6..0346735 100644 --- a/dataduct/pipeline/redshift_copy_activity.py +++ b/dataduct/pipeline/redshift_copy_activity.py @@ -6,11 +6,12 @@ from ..config import Config from .redshift_node import RedshiftNode from .schedule import Schedule +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) -RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class RedshiftCopyActivity(Activity): diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index e9f10f7..ad48a57 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -10,10 +10,11 @@ from ..s3 import S3Path from ..s3 import S3File from ..s3 import S3Directory +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class S3Node(PipelineObject): diff --git a/dataduct/pipeline/schedule.py b/dataduct/pipeline/schedule.py index 3da33bf..35533e4 100644 --- a/dataduct/pipeline/schedule.py +++ b/dataduct/pipeline/schedule.py @@ -6,10 +6,11 @@ from ..config import Config from .pipeline_object import PipelineObject +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -DAILY_LOAD_TIME = 
config.etl.get('DAILY_LOAD_TIME', 1) +DAILY_LOAD_TIME = config.etl.get('DAILY_LOAD_TIME', const.ONE) FEQUENCY_PERIOD_CONVERTION = { diff --git a/dataduct/pipeline/shell_command_activity.py b/dataduct/pipeline/shell_command_activity.py index 114f427..69f311c 100644 --- a/dataduct/pipeline/shell_command_activity.py +++ b/dataduct/pipeline/shell_command_activity.py @@ -5,12 +5,12 @@ from .activity import Activity from ..config import Config from .schedule import Schedule - +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) -RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class ShellCommandActivity(Activity): diff --git a/dataduct/pipeline/sns_alarm.py b/dataduct/pipeline/sns_alarm.py index 395fecd..b9f30cd 100644 --- a/dataduct/pipeline/sns_alarm.py +++ b/dataduct/pipeline/sns_alarm.py @@ -4,9 +4,10 @@ from ..config import Config from .pipeline_object import PipelineObject +from ..utils import constants as const config = Config() -SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', None) +SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) ROLE = config.etl['ROLE'] diff --git a/dataduct/pipeline/sql_activity.py b/dataduct/pipeline/sql_activity.py index cecadc1..2eb9767 100644 --- a/dataduct/pipeline/sql_activity.py +++ b/dataduct/pipeline/sql_activity.py @@ -6,12 +6,12 @@ from ..config import Config from .schedule import Schedule from ..s3 import S3File - +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) -RETRY_DELAY = config.etl.get('RETRY_DELAY', '10 Minutes') +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) +RETRY_DELAY = config.etl.get('RETRY_DELAY', const.DEFAULT_DELAY) class SqlActivity(Activity): diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index c71debd..8e13815 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -8,10 +8,11 @@ from ..s3 import S3Path from ..s3 import S3File from ..s3 import S3LogPath +from ..utils import constants as const from ..utils.exceptions import ETLInputError config = Config() -MAX_RETRIES = config.etl.get('MAX_RETRIES', 0) +MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) class ETLStep(object): diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index 6c8f3bf..e7c9107 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -13,6 +13,9 @@ from ..utils.exceptions import ETLInputError config = Config() +if not hasattr(config, 'mysql'): + raise ETLInputError('MySQL config not specified in ETL') + MYSQL_CONFIG = config.mysql diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index 41d6df5..56a45ea 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -1,6 +1,24 @@ """Constants shared across dataduct """ +# Constants +ZERO = 0 +ONE = 1 +NONE = None +EMPTY_STR = '' +NULL_STR = 'NULL' +DEFAULT_DELAY = '10 Minutes' +DEFAULT_TIMEOUT = '6 Hours' + +# ETL Constants EMR_CLUSTER_STR = 'emr' +EC2_RESOURCE_STR = 'ec2' +M1_LARGE = 'm1.large' + +LOG_STR = 'logs' +DATA_STR = 'data' +SRC_STR = 'src' + +# Step paths SCRIPT_RUNNER_PATH = 'scripts/script_runner.py' DEPENDENCY_SCRIPT_PATH = 'scripts/pipeline_dependency_check.py' From 
43695c6f7f096ac204292e4c567ab0c7aab49f92 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 6 Jan 2015 09:26:44 -0800 Subject: [PATCH 032/175] configure paths more safely --- dataduct/config/config.py | 3 ++- dataduct/utils/constants.py | 7 +++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dataduct/config/config.py b/dataduct/config/config.py index c5eebc3..c1e2cac 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -14,7 +14,8 @@ def get_config_files(): 3. DATADUCT_PATH environment variable """ dataduct_config_path = '/etc/dataduct.cfg' - dataduct_user_config_path = os.path.join(os.path.expanduser('~/.dataduct')) + dataduct_user_config_path = os.path.join( + os.path.expanduser('~'), '.dataduct') config_files = [dataduct_config_path, dataduct_user_config_path] # Check DATADUCT_PATH env variable for other configuration locations diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index 56a45ea..ad46978 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -1,5 +1,6 @@ """Constants shared across dataduct """ +import os # Constants ZERO = 0 @@ -20,5 +21,7 @@ SRC_STR = 'src' # Step paths -SCRIPT_RUNNER_PATH = 'scripts/script_runner.py' -DEPENDENCY_SCRIPT_PATH = 'scripts/pipeline_dependency_check.py' +SCRIPTS_DIRECTORY = 'scripts' +SCRIPT_RUNNER_PATH = os.path.join(SCRIPTS_DIRECTORY, 'script_runner.py') +DEPENDENCY_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, + 'pipeline_dependency_check.py') From 990feefa812603067822ced8e930ac68828d5936 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 6 Jan 2015 22:45:41 -0800 Subject: [PATCH 033/175] tests in the correct directory --- CHANGES.md | 21 ++++++++++++++++++- MANIFEST.in | 1 - README.rst | 2 +- dataduct/{ => etl}/tests/__init__.py | 0 .../{ => etl}/tests/test_definition_parser.py | 0 requirements.txt | 10 ++++----- 6 files changed, 26 insertions(+), 8 deletions(-) rename dataduct/{ => etl}/tests/__init__.py (100%) rename dataduct/{ => etl}/tests/test_definition_parser.py (100%) diff --git a/CHANGES.md b/CHANGES.md index 174235a..545f227 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,4 +1,4 @@ -# Changes in ETL_Lib +# Changes in dataduct ### 0.1.0 - Initial version of the dataduct library released @@ -12,3 +12,22 @@ - sql-command - transform - Examples and documentation added for all the steps + +### 0.2.0 +- Support for custom steps +- Pipeline dependency step +- Reduce verbosity of imports +- Step parsing is isolated in steps +- More examples for steps +- QA step functions added +- Visualization of pipelines +- Sync config with S3 +- Config overrides with modes +- Rename keywords and safe config failure handling +- MySQL and Redshift connection support +- EMR Streaming support with hadoop 2 +- Custom EMR job step +- Support for input_path to steps to directly create S3Nodes +- Transform step to support directory based installs +- Exceptions cleanup +- Read the docs support diff --git a/MANIFEST.in b/MANIFEST.in index 5276a85..8c35769 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -3,4 +3,3 @@ include *.md include *.rst include *.py recursive-include bin * -recursive-include scripts * diff --git a/README.rst b/README.rst index 9582f9b..bea208e 100644 --- a/README.rst +++ b/README.rst @@ -7,7 +7,7 @@ pipeline objects.
**Documentation and Details** -Documentation and more details can be found at http://pythonhosted.org/dataduct/ +Documentation and more details can be found at http://dataduct.readthedocs.org/en/latest/ **License** diff --git a/dataduct/tests/__init__.py b/dataduct/etl/tests/__init__.py similarity index 100% rename from dataduct/tests/__init__.py rename to dataduct/etl/tests/__init__.py diff --git a/dataduct/tests/test_definition_parser.py b/dataduct/etl/tests/test_definition_parser.py similarity index 100% rename from dataduct/tests/test_definition_parser.py rename to dataduct/etl/tests/test_definition_parser.py diff --git a/requirements.txt b/requirements.txt index 3673b70..163703d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -Sphinx==1.2.3 -boto==2.34.0 -sphinx-rtd-theme==0.1.6 -sphinxcontrib-napoleon==0.2.8 -pandas==0.14.1 +boto>=2.34.0 +Sphinx>=1.2.3 +sphinx-rtd-theme>=0.1.6 +sphinxcontrib-napoleon>=0.2.8 +pandas>=0.14.1 From cb68ac506ad3aaae459fe9c6e37fe6638334c0d5 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 6 Jan 2015 22:49:10 -0800 Subject: [PATCH 034/175] Initial Commit --- parseql/__init__.py | 4 ++++ parseql/database/__init__.py | 0 parseql/database/tests/__init__.py | 0 parseql/parsers/__init__.py | 0 parseql/parsers/tests/__init__.py | 0 parseql/sql/__init__.py | 0 parseql/sql/tests/__init__.py | 0 7 files changed, 4 insertions(+) create mode 100644 parseql/__init__.py create mode 100644 parseql/database/__init__.py create mode 100644 parseql/database/tests/__init__.py create mode 100644 parseql/parsers/__init__.py create mode 100644 parseql/parsers/tests/__init__.py create mode 100644 parseql/sql/__init__.py create mode 100644 parseql/sql/tests/__init__.py diff --git a/parseql/__init__.py b/parseql/__init__.py new file mode 100644 index 0000000..03b2b5e --- /dev/null +++ b/parseql/__init__.py @@ -0,0 +1,4 @@ +"""Welcome to Parseql +""" +__version__ = '0.1.0' +__import__('pkg_resources').declare_namespace(__name__) diff --git a/parseql/database/__init__.py b/parseql/database/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parseql/database/tests/__init__.py b/parseql/database/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parseql/parsers/__init__.py b/parseql/parsers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parseql/parsers/tests/__init__.py b/parseql/parsers/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parseql/sql/__init__.py b/parseql/sql/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/parseql/sql/tests/__init__.py b/parseql/sql/tests/__init__.py new file mode 100644 index 0000000..e69de29 From b459d96a96fcf655fca6d83202caa53a57030a41 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 7 Jan 2015 16:23:00 -0800 Subject: [PATCH 035/175] Sanitize SQL transformations --- parseql/parsers/__init__.py | 4 + parseql/parsers/tests/test_transfrom.py | 180 ++++++++++++++++++++++++ parseql/parsers/transform.py | 96 +++++++++++++ parseql/sql/sql_script.py | 104 ++++++++++++++ parseql/sql/sql_statement.py | 41 ++++++ parseql/sql/tests/test_sql_statement.py | 43 ++++++ parseql/sql/transaction.py | 24 ++++ parseql/sql/utils.py | 32 +++++ 8 files changed, 524 insertions(+) create mode 100644 parseql/parsers/tests/test_transfrom.py create mode 100644 parseql/parsers/transform.py create mode 100644 parseql/sql/sql_script.py create mode 100644 parseql/sql/sql_statement.py create mode 100644 parseql/sql/tests/test_sql_statement.py 
create mode 100644 parseql/sql/transaction.py create mode 100644 parseql/sql/utils.py diff --git a/parseql/parsers/__init__.py b/parseql/parsers/__init__.py index e69de29..99edcd8 100644 --- a/parseql/parsers/__init__.py +++ b/parseql/parsers/__init__.py @@ -0,0 +1,4 @@ +from .transform import remove_comments +from .transform import remove_empty_statements +from .transform import remove_transactional +from .transform import split_statements diff --git a/parseql/parsers/tests/test_transfrom.py b/parseql/parsers/tests/test_transfrom.py new file mode 100644 index 0000000..18fe2c0 --- /dev/null +++ b/parseql/parsers/tests/test_transfrom.py @@ -0,0 +1,180 @@ +"""Tests for the transformation steps +""" +from unittest import TestCase +from nose.tools import eq_ + +from ..transform import split_statements +from ..transform import remove_comments +from ..transform import remove_empty_statements +from ..transform import remove_transactional + + +class TestRemoveEmptyStatements(TestCase): + """Tests for remove_empty_statements function + """ + @staticmethod + def test_basic(): + """Basic test for single location of seperator + """ + data = 'a;;;' + result = 'a;' + + eq_(remove_empty_statements(data), result) + + @staticmethod + def test_multiple_statements_single_duplication(): + """Test for multiple locations of seperator with single duplication + """ + data = 'a; b;; c;' + result = 'a; b; c;' + + eq_(remove_empty_statements(data), result) + + @staticmethod + def test_multiple_statements_multiple_duplication(): + """Test for multiple locations of seperator with multiple duplication + """ + data = 'a;;; b;; c;;;' + result = 'a; b; c;' + + eq_(remove_empty_statements(data), result) + + @staticmethod + def test_start_empty(): + """Test for removing an empty statement at start + """ + data = '; a; ; ;;; b; c;;;' + result = ' a; b; c;' + + eq_(remove_empty_statements(data), result) + + +class TestRemoveComments(TestCase): + """Tests for remove_comments function + """ + @staticmethod + def test_multiline_comment(): + """Basic test for removing multiline comments + """ + data = 'a; /* This is \n \n a multiline comment */ b;' + result = 'a; b;' + + eq_(remove_comments(data), result) + + @staticmethod + def test_singleline_comment_basic(): + """Basic test for removing singleline comments + """ + data = 'a; b; --Comment' + result = 'a; b; ' + + eq_(remove_comments(data), result) + + @staticmethod + def test_singleline_comment_advanced(): + """Advanced test for removing singleline comments + """ + data = '-- Comment \n a; b;' + result = '\n a; b;' + + eq_(remove_comments(data), result) + + @staticmethod + def test_singleline_multiline_comment(): + """Advanced test for removing singleline comments + """ + data = 'a; /* This is \n \n a multiline comment */ b;-- Comment ' + result = 'a; b;' + + eq_(remove_comments(data), result) + + +class TestRemoveTransactional(TestCase): + """Tests for remove_transactional function + """ + @staticmethod + def test_remove_none(): + """Basic test for removing nothing + """ + data = 'a; b;' + result = 'a; b;' + + eq_(remove_transactional(data), result) + + @staticmethod + def test_remove_begin(): + """Basic test for removing begin + """ + data = 'begin; a; b;' + result = ' a; b;' + + eq_(remove_empty_statements(remove_transactional(data)), result) + + @staticmethod + def test_remove_commit(): + """Basic test for removing commit + """ + data = 'a; b; commit;' + result = 'a; b;' + + eq_(remove_empty_statements(remove_transactional(data)), result) + + @staticmethod + def 
test_remove_begin_commit(): + """Basic test for removing begin & commit + """ + data = 'begin; a; b; commit;' + result = ' a; b;' + + eq_(remove_empty_statements(remove_transactional(data)), result) + + @staticmethod + def test_just_begin_commit(): + """Basic test for removing begin & commit + """ + data = 'begin; commit;' + result = '' + + eq_(remove_empty_statements(remove_transactional(data)), result) + +class TestSplitOmitQuoted(TestCase): + """Tests for split_statements function + """ + @staticmethod + def test_basic(): + """Basic test for spliting a string based on the seperator + """ + data = 'a; b \n t; c; d ; ' + result = ['a', 'b \n t', 'c', 'd'] + + eq_(split_statements(data), result) + + @staticmethod + def test_newline_sql(): + """Split SQL statement with newlines + """ + data = 'a; b \n e; c; \n \n d ; ' + result = ['a', 'b \n e', 'c', 'd'] + + eq_(split_statements(data), result) + + @staticmethod + def test_paran_sql(): + """Split SQL statement with paranthesis + """ + data = 'a; b (x\n,y,z) d; c; \n \n d ; ' + result = ['a', 'b (x\n,y,z) d', 'c', 'd'] + + eq_(split_statements(data), result) + + @staticmethod + def test_multiple_sql(): + """Advanced test with removing comments and empty sql statements + """ + data = """a; /* This is \n + a multiline comment */ b;; \n ; -- Comment \n c; d; """ + + result = ['a', 'b', 'c', 'd'] + + eq_(split_statements(remove_empty_statements( + remove_comments(data))), result) diff --git a/parseql/parsers/transform.py b/parseql/parsers/transform.py new file mode 100644 index 0000000..94bf858 --- /dev/null +++ b/parseql/parsers/transform.py @@ -0,0 +1,96 @@ +"""Module containing basic transform functions on strings +""" + +from pyparsing import CaselessKeyword +from pyparsing import CharsNotIn +from pyparsing import delimitedList +from pyparsing import Literal +from pyparsing import nestedExpr +from pyparsing import OneOrMore +from pyparsing import originalTextFor +from pyparsing import printables +from pyparsing import replaceWith +from pyparsing import Word +from pyparsing import ZeroOrMore + + +def remove_empty_statements(string, seperator=';'): + """Remove empty statements from the string + + Args: + string(str): String to be processed + seperator(str): Seperater to be checked for duplicates + + Returns: + result(str): String with empty statements trimmed + """ + if string == '': + return string + + empty_statement = seperator + OneOrMore(seperator) + empty_statement.setParseAction(replaceWith(seperator)) + string = empty_statement.transformString(string) + + return string.lstrip(seperator) + + +def remove_comments(string): + """Remove comments from the statements + + Args: + string(str): String to be processed + + Returns: + result(str): String with comments trimmed + """ + + if string == '': + return string + + # Remove multiline comments + multiline_comment = nestedExpr('/*', '*/').suppress() + string = multiline_comment.transformString(string) + + # Remove single line comments + singleline_comment = Literal('--') + ZeroOrMore(CharsNotIn('\n')) + string = singleline_comment.suppress().transformString(string) + + return string + + +def remove_transactional(string): + """Remove begin or commit from the statement + + Args: + string(str): String to be processed + + Returns: + result(str): String with begin and commit trimmed + """ + transaction = (CaselessKeyword('BEGIN')| CaselessKeyword('COMMIT')) + return transaction.suppress().transformString(string) + + +def split_statements(string, seperator=';'): + """Seperate the string based on the 
seperator + + Args: + string(str): String to be processed + seperator(str): Seperater to split the statements + + Returns: + result(list of str): Statements split based on the seperator + """ + if string == '': + return [] + + # words can contain anything but the seperator + printables_less_seperator = printables.replace(seperator, '') + + # capture content between seperators, and preserve original text + content = originalTextFor(OneOrMore(Word(printables_less_seperator))) + + # process the string + tokens = delimitedList(content, seperator).parseString(string) + + return tokens.asList() diff --git a/parseql/sql/sql_script.py b/parseql/sql/sql_script.py new file mode 100644 index 0000000..7dfd8ad --- /dev/null +++ b/parseql/sql/sql_script.py @@ -0,0 +1,104 @@ +"""Script that contains the sql script class +""" +from copy import deepcopy + +from .sql_statement import SqlStatement +from .transaction import BeginStatement +from .transaction import CommitStatement +from .utils import atmost_one +from .utils import sanatize_sql + + +class SqlScript(object): + """Class representing a single SQL Script + """ + def __init__(self, sql=None, statements=None, filename=None): + """Constructor for the SqlScript class + """ + assert atmost_one(sql, statements, filename), 'Multiple intializer' + + if sql is None: + sql = '' + + if filename: + with open(filename, 'r') as f: + sql = f.read() + + self._raw_sql = sql + self._raw_statements = self._sanatize_sql() + self._statements = self.initialize_statements() + + # Add the statements that the script was initialized from + self.append(statements) + + def __str__(self): + """Print a SqlScript object + """ + return self.sql() + + def __iter__(self): + """Iterator for iterating over all the sql statements + """ + return iter(self._statements) + + def __len__(self): + """Length of the sqlscript + """ + return len(self._statements) + + @property + def statements(self): + """Returns the SQLStatements of the script + """ + return self._statements + + def sql(self): + """Returns the sql for the SqlScript + """ + return ';\n'.join([x.sql() for x in self._statements]) + + def _sanatize_sql(self): + """Clean the SQL, remove comments and empty statements + """ + return sanatize_sql(self._raw_sql) + + def _initialize_statements(self): + """Initialize SQL Statements based on the inputscipt + """ + return [SqlStatement(x) for x in self._raw_statements] + + def copy(self): + """Create a copy of the SQL Script object + """ + return deepcopy(self) + + def append(self, elements): + """Append the elements to the SQL script + """ + if isinstance(elements, SqlStatement): + self.add_statement(elements) + return self.copy() + + if isinstance(elements, str): + elements = self.__class__(elements) + + for element in elements: + self.add_statement(element) + + return self.copy() + + def add_statement(self, statement): + """Add a single SqlStatement to the SQL Script + """ + if not isinstance(statement, SqlStatement): + raise ValueError('Input must be of the type SqlStatement') + + self._statements.append(statement) + self._raw_statements.append(statement.sql()) + + def wrap_transaction(self): + """Wrap the script in transaction + """ + new_script = self.__class__() + new_script.append([BeginStatement(), self, CommitStatement()]) + return new_script diff --git a/parseql/sql/sql_statement.py b/parseql/sql/sql_statement.py new file mode 100644 index 0000000..ecbe0f7 --- /dev/null +++ b/parseql/sql/sql_statement.py @@ -0,0 +1,41 @@ +"""Script that contains the sql statement class +""" + 
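# A rough sketch of how the parser helpers defined above fit together
# (mirroring parsers/tests/test_transfrom.py): sanatize_sql in sql/utils.py
# chains them as
#     split_statements(remove_empty_statements(remove_transactional(remove_comments(sql))))
# so that "a; /* note */ b;; -- trailing\n c;" reduces to ['a', 'b', 'c'].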
+from .utils import sanatize_sql + + +class SqlStatement(object): + """Class representing a single SQL statement + """ + def __init__(self, sql=None): + """Constructor for the SqlStatement class + """ + if sql is None: + sql = '' + self._raw_sql = sql + self._raw_statement = self._sanatize_sql() + + def __str__(self): + """Print a SqlStatement object + """ + return self.sql() + + def sql(self): + """Returns the raw_sql for the SqlStatement + """ + return self._raw_statement + + def _sanatize_sql(self): + """Clean the SQL, remove comments and empty statements + """ + if self._raw_sql is None: + return '' + + raw_statements = sanatize_sql(self._raw_sql) + + if len(raw_statements) > 1: + raise ValueError('SQL Statement can not contain more than 1 query') + elif len(raw_statements) == 1: + return raw_statements[0] + else: + return '' diff --git a/parseql/sql/tests/test_sql_statement.py b/parseql/sql/tests/test_sql_statement.py new file mode 100644 index 0000000..3d0b684 --- /dev/null +++ b/parseql/sql/tests/test_sql_statement.py @@ -0,0 +1,43 @@ +"""Tests for the SqlStatement class +""" +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import raises + +from ..sql_statement import SqlStatement + + +class TestSqlStatement(TestCase): + """Tests for sql statement function + """ + @staticmethod + def test_basic(): + """Basic test for statement declaration + """ + query = 'select \n 1;' + result = 'select \n 1' + + eq_(SqlStatement(query).sql(), result) + + @staticmethod + def test_sanatization(): + """Sanatization of comments + """ + query = 'select 1 -- test connect \n;' + result = 'select 1' + + eq_(SqlStatement(query).sql(), result) + + @staticmethod + @raises(ValueError) + def test_error(): + """Raise error if multiple queries are passed + """ + query = 'select 1; select 2;' + SqlStatement(query) + + @staticmethod + def test_empty_declaration(): + """Empty if no sql query is passed + """ + eq_(SqlStatement().sql(), '') diff --git a/parseql/sql/transaction.py b/parseql/sql/transaction.py new file mode 100644 index 0000000..4e0ae95 --- /dev/null +++ b/parseql/sql/transaction.py @@ -0,0 +1,24 @@ +"""SQL Statements used in transactions +""" + +from .sql_statement import SqlStatement + + +class BeginStatement(SqlStatement): + """Class representing begin sql statement + """ + def __init__(self): + """Constructor for begin class + """ + sql = 'BEGIN' + super(BeginStatement, self).__init__(sql) + + +class CommitStatement(SqlStatement): + """Class representing Commit sql statement + """ + def __init__(self): + """Constructor for Commit class + """ + sql = 'COMMIT' + super(CommitStatement, self).__init__(sql) diff --git a/parseql/sql/utils.py b/parseql/sql/utils.py new file mode 100644 index 0000000..e16b20b --- /dev/null +++ b/parseql/sql/utils.py @@ -0,0 +1,32 @@ +""" +Shared utility functions +""" +from ..parsers import remove_comments +from ..parsers import remove_empty_statements +from ..parsers import split_statements +from ..parsers import remove_transactional + + +def atmost_one(*args): + """Asserts one of the arguments is not None + + Returns: + result(bool): True if exactly one of the arguments is not None + """ + return sum([1 for a in args if a is not None]) <= 1 + + +def sanatize_sql(sql): + """Sanatize the sql string + """ + # remove comments + string = remove_comments(sql) + + # remove transactionals + string = remove_transactional(string) + + # remove empty statements + string = remove_empty_statements(string) + + # split into multiple statements + return 
split_statements(string) From 7b3c302c6e4b0cfea662d194e10dbc1d0544c14f Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 8 Jan 2015 00:02:43 -0800 Subject: [PATCH 036/175] Remove new lines from sql --- parseql/parsers/__init__.py | 1 + parseql/parsers/tests/test_transfrom.py | 32 ++++++++ parseql/parsers/transform.py | 10 +++ parseql/sql/sql_script.py | 11 ++- parseql/sql/sql_statement.py | 2 +- parseql/sql/tests/test_sql_script.py | 104 ++++++++++++++++++++++++ parseql/sql/tests/test_sql_statement.py | 2 +- parseql/sql/transaction.py | 6 +- parseql/sql/utils.py | 9 +- 9 files changed, 165 insertions(+), 12 deletions(-) create mode 100644 parseql/sql/tests/test_sql_script.py diff --git a/parseql/parsers/__init__.py b/parseql/parsers/__init__.py index 99edcd8..5b4fe6d 100644 --- a/parseql/parsers/__init__.py +++ b/parseql/parsers/__init__.py @@ -2,3 +2,4 @@ from .transform import remove_empty_statements from .transform import remove_transactional from .transform import split_statements +from .transform import remove_newlines diff --git a/parseql/parsers/tests/test_transfrom.py b/parseql/parsers/tests/test_transfrom.py index 18fe2c0..531b3da 100644 --- a/parseql/parsers/tests/test_transfrom.py +++ b/parseql/parsers/tests/test_transfrom.py @@ -7,6 +7,7 @@ from ..transform import remove_comments from ..transform import remove_empty_statements from ..transform import remove_transactional +from ..transform import remove_newlines class TestRemoveEmptyStatements(TestCase): @@ -49,6 +50,37 @@ def test_start_empty(): eq_(remove_empty_statements(data), result) +class TestRemoveNewLines(TestCase): + """Tests for remove_empty_statements function + """ + @staticmethod + def test_basic(): + """Basic test for single location of seperator + """ + data = 'a\n \n;' + result = 'a ;' + + eq_(remove_newlines(data), result) + + @staticmethod + def test_advanced(): + """Basic test for single location of seperator + """ + data = 'a,\nb,\nc\n\rfrom \r\n xyz' + result = 'a, b, c from xyz' + + eq_(remove_newlines(data), result) + + @staticmethod + def test_quoted_newlines(): + """Basic test for single location of seperator + """ + data = "a,\nb,\nc\n\rfrom \r\n xyz where b='a\nc'" + result = "a, b, c from xyz where b='a\nc'" + + eq_(remove_newlines(data), result) + + class TestRemoveComments(TestCase): """Tests for remove_comments function """ diff --git a/parseql/parsers/transform.py b/parseql/parsers/transform.py index 94bf858..fc308ca 100644 --- a/parseql/parsers/transform.py +++ b/parseql/parsers/transform.py @@ -1,6 +1,8 @@ """Module containing basic transform functions on strings """ +import re + from pyparsing import CaselessKeyword from pyparsing import CharsNotIn from pyparsing import delimitedList @@ -94,3 +96,11 @@ def split_statements(string, seperator=';'): tokens = delimitedList(content, seperator).parseString(string) return tokens.asList() + + +def remove_newlines(string): + """Remove new lines from a string unless in single quotes + """ + # In general the aim is to avoid regex as they are hard to maintain + regex = r"(?:[^\s\n\r']|'(?:\\.|[^'])*')+" + return ' '.join(re.findall(regex, string)) diff --git a/parseql/sql/sql_script.py b/parseql/sql/sql_script.py index 7dfd8ad..6a31476 100644 --- a/parseql/sql/sql_script.py +++ b/parseql/sql/sql_script.py @@ -26,10 +26,11 @@ def __init__(self, sql=None, statements=None, filename=None): self._raw_sql = sql self._raw_statements = self._sanatize_sql() - self._statements = self.initialize_statements() + self._statements = self._initialize_statements() # 
Add the statements that the script was initialized from - self.append(statements) + if statements: + self.append(statements) def __str__(self): """Print a SqlScript object @@ -55,7 +56,7 @@ def statements(self): def sql(self): """Returns the sql for the SqlScript """ - return ';\n'.join([x.sql() for x in self._statements]) + return ';\n'.join([x.sql() for x in self._statements]) + ';' def _sanatize_sql(self): """Clean the SQL, remove comments and empty statements @@ -100,5 +101,7 @@ def wrap_transaction(self): """Wrap the script in transaction """ new_script = self.__class__() - new_script.append([BeginStatement(), self, CommitStatement()]) + new_script.append( + [BeginStatement()] + self.statements + [CommitStatement()]) + return new_script diff --git a/parseql/sql/sql_statement.py b/parseql/sql/sql_statement.py index ecbe0f7..3355836 100644 --- a/parseql/sql/sql_statement.py +++ b/parseql/sql/sql_statement.py @@ -31,7 +31,7 @@ def _sanatize_sql(self): if self._raw_sql is None: return '' - raw_statements = sanatize_sql(self._raw_sql) + raw_statements = sanatize_sql(self._raw_sql, keep_transaction=True) if len(raw_statements) > 1: raise ValueError('SQL Statement can not contain more than 1 query') diff --git a/parseql/sql/tests/test_sql_script.py b/parseql/sql/tests/test_sql_script.py new file mode 100644 index 0000000..25373eb --- /dev/null +++ b/parseql/sql/tests/test_sql_script.py @@ -0,0 +1,104 @@ +"""Tests for the SqlScript class +""" +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import assert_not_equal + +from ..sql_statement import SqlStatement +from ..sql_script import SqlScript + + +class TestSqlScript(TestCase): + """Tests for sql Script function + """ + @staticmethod + def test_basic(): + """Basic test for Script declaration + """ + query = 'select \n 1;' + result = 'select 1;' + + eq_(SqlScript(query).sql(), result) + + @staticmethod + def test_sanatization(): + """Sanatization of comments + """ + query = 'select 1 -- test connect \n;' + result = 'select 1;' + + eq_(SqlScript(query).sql(), result) + + @staticmethod + def test_multiple_queries(): + """Raise error if multiple queries are passed + """ + query = 'select 1; select 2;' + result = 'select 1;\nselect 2;' + eq_(SqlScript(query).sql(), result) + + @staticmethod + def test_empty_declaration(): + """Empty if no sql query is passed + """ + eq_(SqlScript().sql(), ';') + + @staticmethod + def test_length(): + """Length of sql script + """ + query = 'select 1; select 2;' + result = 2 + eq_(len(SqlScript(query)), result) + + @staticmethod + def test_append_statement(): + """Appending a statement to sql script + """ + script = SqlScript() + script.append(SqlStatement('Select 1')) + eq_(script.sql(), 'Select 1;') + + @staticmethod + def test_append_script(): + """Appending a script to sql script + """ + script = SqlScript('Select 1;') + script_new = SqlScript('Select 2;') + script.append(script_new) + eq_(script.sql(), 'Select 1;\nSelect 2;') + + @staticmethod + def test_append_string(): + """Appending a string to sql script + """ + script = SqlScript('Select 1;') + script.append('Select 2;') + eq_(script.sql(), 'Select 1;\nSelect 2;') + + @staticmethod + def test_copy(): + """Copy a sql script + """ + script = SqlScript('Select 1;') + script_new = script.copy() + eq_(script.sql(), script_new.sql()) + + # Check if it was a copy or the same object + assert_not_equal(id(script), id(script_new)) + + @staticmethod + def test_wrap_transaction(): + """Wrap the sql script in a transaction + """ + script = 
SqlScript('Select 1;').wrap_transaction() + result = 'BEGIN;\nSelect 1;\nCOMMIT;' + eq_(script.sql(), result) + + @staticmethod + def test_paranthesis(): + """Test sql with paranthesis is sanatized correctly + """ + script = SqlScript('create table test (session_id INTEGER);') + result = 'create table test (session_id INTEGER);' + eq_(script.sql(), result) diff --git a/parseql/sql/tests/test_sql_statement.py b/parseql/sql/tests/test_sql_statement.py index 3d0b684..9067bb8 100644 --- a/parseql/sql/tests/test_sql_statement.py +++ b/parseql/sql/tests/test_sql_statement.py @@ -15,7 +15,7 @@ def test_basic(): """Basic test for statement declaration """ query = 'select \n 1;' - result = 'select \n 1' + result = 'select 1' eq_(SqlStatement(query).sql(), result) diff --git a/parseql/sql/transaction.py b/parseql/sql/transaction.py index 4e0ae95..f1c3a9a 100644 --- a/parseql/sql/transaction.py +++ b/parseql/sql/transaction.py @@ -10,8 +10,7 @@ class BeginStatement(SqlStatement): def __init__(self): """Constructor for begin class """ - sql = 'BEGIN' - super(BeginStatement, self).__init__(sql) + super(BeginStatement, self).__init__('BEGIN') class CommitStatement(SqlStatement): @@ -20,5 +19,4 @@ class CommitStatement(SqlStatement): def __init__(self): """Constructor for Commit class """ - sql = 'COMMIT' - super(CommitStatement, self).__init__(sql) + super(CommitStatement, self).__init__('COMMIT') diff --git a/parseql/sql/utils.py b/parseql/sql/utils.py index e16b20b..1e92117 100644 --- a/parseql/sql/utils.py +++ b/parseql/sql/utils.py @@ -5,6 +5,7 @@ from ..parsers import remove_empty_statements from ..parsers import split_statements from ..parsers import remove_transactional +from ..parsers import remove_newlines def atmost_one(*args): @@ -16,14 +17,18 @@ def atmost_one(*args): return sum([1 for a in args if a is not None]) <= 1 -def sanatize_sql(sql): +def sanatize_sql(sql, keep_transaction=False): """Sanatize the sql string """ # remove comments string = remove_comments(sql) # remove transactionals - string = remove_transactional(string) + if not keep_transaction: + string = remove_transactional(string) + + # remove new lines + string = remove_newlines(string) # remove empty statements string = remove_empty_statements(string) From 65c9f53a6e3ca9ee310c7691c1a98657ac8f6214 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 9 Jan 2015 00:29:01 -0800 Subject: [PATCH 037/175] Create table parser --- parseql/parsers/create_table.py | 180 +++++++++++++++++++++ parseql/parsers/tests/test_create_table.py | 3 + parseql/parsers/transform.py | 7 +- parseql/parsers/utils.py | 55 +++++++ parseql/sql/sql_statement.py | 5 +- parseql/sql/tests/test_sql_script.py | 4 +- parseql/sql/transaction.py | 4 +- 7 files changed, 250 insertions(+), 8 deletions(-) create mode 100644 parseql/parsers/create_table.py create mode 100644 parseql/parsers/tests/test_create_table.py create mode 100644 parseql/parsers/utils.py diff --git a/parseql/parsers/create_table.py b/parseql/parsers/create_table.py new file mode 100644 index 0000000..50b57f4 --- /dev/null +++ b/parseql/parsers/create_table.py @@ -0,0 +1,180 @@ +"""Create SQL parser +""" +from pyparsing import alphanums +from pyparsing import CharsNotIn +from pyparsing import Combine +from pyparsing import delimitedList +from pyparsing import OneOrMore +from pyparsing import ParseException +from pyparsing import ParseResults +from pyparsing import Word +from pyparsing import ZeroOrMore + +from ..sql.sql_statement import SqlStatement + +from .utils import _all +from .utils 
import _create +from .utils import _db_name +from .utils import _distkey +from .utils import _diststyle +from .utils import _encode +from .utils import _even +from .utils import _foreign_key +from .utils import _key +from .utils import _not_null +from .utils import _null +from .utils import _references +from .utils import _sortkey +from .utils import _table +from .utils import column_types +from .utils import existance_check +from .utils import isNotEmpty +from .utils import pk_check +from .utils import temporary_check + + +def paranthesis_list(output_name, input_var=_db_name): + """Parser for a delimiedList enclosed in paranthesis + """ + return '(' + delimitedList(input_var).setResultsName(output_name) + ')' + + +def fk_reference(): + """Get Parser for foreign key references + """ + fk_reference_columns = paranthesis_list('fk_reference_columns') + fk_table = _db_name.setResultsName('fk_table') + return _references + fk_table + fk_reference_columns + + +def exists(parser, output_name): + """Get a parser that returns boolean on existance + """ + return parser.setParseAction(isNotEmpty).setResultsName(output_name) + + +def get_base_parser(): + """Get a pyparsing parser for a create table statement + + Returns: + table_definition(pyparsing): Parser for create table statements + """ + + temp_check = temporary_check.setResultsName('temporary') + exists_check = existance_check.setResultsName('if_exists') + + table_name = _db_name.setResultsName('table_name') + + # Initial portions of the table definition + def_start = _create + temp_check + _table + table_name + exists_check + + subquery = Combine('(' + ZeroOrMore(CharsNotIn(')')) + ')') + _word = Word(alphanums+"_-. ") + def_field = Combine(OneOrMore(_word | subquery)) + + table_def = def_start + paranthesis_list('raw_fields', def_field) + \ + get_attributes_parser() + + return table_def + + +def get_column_parser(): + """Get a pyparsing parser for a create table column field statement + + Returns: + column_definition(pyparsing): Parser for column definitions + """ + column_name = _db_name.setResultsName('column_name') + column_type = column_types.setResultsName('column_type') + + constraints = exists(_not_null, 'is_not_null') + constraints |= exists(_null, 'is_null') + constraints |= exists(pk_check, 'is_primary_key') + constraints |= exists(_distkey, 'is_distkey') + constraints |= exists(_sortkey, 'is_sortkey') + constraints |= fk_reference() + constraints |= _encode + _db_name.setResultsName('encoding') + + column_def = column_name + column_type + ZeroOrMore(constraints) + return column_def + + +def get_constraints_parser(): + """Get a pyparsing parser for a create table constraints field statement + + Returns: + constraints_definition(pyparsing): Parser for constraints definitions + """ + # Primary Key Constraints + def_pk = pk_check + paranthesis_list('pk_columns') + + # Foreign Key Constraints + def_fk = _foreign_key + paranthesis_list('fk_columns') + fk_reference() + + return def_pk | def_fk + + +def get_attributes_parser(): + """Get a pyparsing parser for a create table attributes + + Returns: + attribute_parser(pyparsing): Parser for attribute definitions + """ + diststyle_def = _diststyle + (_all | _even | _key).setResultsName( + 'diststyle') + + distkey_def = _distkey + paranthesis_list('distkey') + sortkey_def = _sortkey + paranthesis_list('sortkey') + + return OneOrMore(diststyle_def | sortkey_def | distkey_def) + + +def to_dict(input): + """Purge the ParseResults from output dictionary + """ + output = dict() + for key, value in 
input.asDict().iteritems(): + if isinstance(value, ParseResults): + output[key] = value.asList() + else: + output[key] = value + + return output + + +def parse_create_table(statement): + """Parse the create table sql query and return metadata + + Args: + statement(SqlStatement): Input sql statement that should be parsed + + Returns: + table_data(dict): table_data dictionary for instantiating a table object + """ + + if not isinstance(statement, SqlStatement): + raise ValueError('Input to table parser must of a SqlStatement object') + + string = statement.sql() + + # Parse the base table definitions + table_data = to_dict(get_base_parser().parseString(string)) + + # Parse the columns and append to the list + table_data['columns'] = list() + table_data['constraints'] = list() + + for field in table_data['raw_fields']: + try: + column = to_dict(get_column_parser().parseString(field)) + table_data['columns'].append(column) + except ParseException: + try: + constraint = to_dict( + get_constraints_parser().parseString(field)) + table_data['constraints'].append(constraint) + except ParseException: + print '[Error] : ', field + raise + + return table_data diff --git a/parseql/parsers/tests/test_create_table.py b/parseql/parsers/tests/test_create_table.py new file mode 100644 index 0000000..8b23c92 --- /dev/null +++ b/parseql/parsers/tests/test_create_table.py @@ -0,0 +1,3 @@ +"""Tests for create table parser +""" + diff --git a/parseql/parsers/transform.py b/parseql/parsers/transform.py index fc308ca..ee73953 100644 --- a/parseql/parsers/transform.py +++ b/parseql/parsers/transform.py @@ -13,6 +13,7 @@ from pyparsing import printables from pyparsing import replaceWith from pyparsing import Word +from pyparsing import WordStart from pyparsing import ZeroOrMore @@ -54,7 +55,7 @@ def remove_comments(string): string = multiline_comment.transformString(string) # Remove single line comments - singleline_comment = Literal('--') + ZeroOrMore(CharsNotIn('\n')) + singleline_comment = Literal('--') + ZeroOrMore(CharsNotIn('\n')) string = singleline_comment.suppress().transformString(string) return string @@ -69,7 +70,9 @@ def remove_transactional(string): Returns: result(str): String with begin and commit trimmed """ - transaction = (CaselessKeyword('BEGIN')| CaselessKeyword('COMMIT')) + transaction = WordStart() + ( + CaselessKeyword('BEGIN')| CaselessKeyword('COMMIT')) + return transaction.suppress().transformString(string) diff --git a/parseql/parsers/utils.py b/parseql/parsers/utils.py new file mode 100644 index 0000000..ddeef01 --- /dev/null +++ b/parseql/parsers/utils.py @@ -0,0 +1,55 @@ +"""SQL parser utils and constants +""" + +from pyparsing import alphanums +from pyparsing import CaselessKeyword +from pyparsing import Combine +from pyparsing import nums +from pyparsing import Optional +from pyparsing import Word + + +# Functions +isNotEmpty = lambda x: len(x) > 0 + +# Data types +_smallint = CaselessKeyword('SMALLINT') +_integer = CaselessKeyword('INTEGER') +_bigint = CaselessKeyword('BIGINT') +_decimal = Combine(CaselessKeyword('DECIMAL') + '(' + Word(nums + ',') + ')') +_real = (CaselessKeyword('REAL') | CaselessKeyword('FLOAT')) +_double = CaselessKeyword('DOUBLE') +_boolean = CaselessKeyword('BOOLEAN') +_char = CaselessKeyword('CHAR') +_varchar = Combine(CaselessKeyword('VARCHAR') + '(' + Word(nums) + ')') +_date = CaselessKeyword('DATE') +_timestamp = CaselessKeyword('TIMESTAMP') + +# Create SQL keywords +_create = CaselessKeyword('CREATE') +_table = CaselessKeyword('TABLE') +_temp = 
CaselessKeyword('TEMP') +_temporary = CaselessKeyword('TEMPORARY') +_if_not_exists = CaselessKeyword('IF NOT EXISTS') +_primary_key = CaselessKeyword('PRIMARY KEY') +_foreign_key = CaselessKeyword('FOREIGN KEY') +_references = CaselessKeyword('REFERENCES') +_unique = CaselessKeyword('UNIQUE') +_null = CaselessKeyword('NULL') +_not_null = CaselessKeyword('NOT NULL') +_distkey = CaselessKeyword('DISTKEY') +_diststyle = CaselessKeyword('DISTSTYLE') +_sortkey = CaselessKeyword('SORTKEY') +_encode = CaselessKeyword('ENCODE') +_all = CaselessKeyword('ALL') +_even = CaselessKeyword('EVEN') +_key = CaselessKeyword('KEY') + +# Parsers +_db_name = Word(alphanums+"_-.") +temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) +existance_check = Optional(_if_not_exists).setParseAction(isNotEmpty) +pk_check = (_primary_key | _unique) + +column_types = _smallint | _integer | _bigint | _decimal | _real | _double +column_types |= _boolean | _char | _varchar | _date | _timestamp diff --git a/parseql/sql/sql_statement.py b/parseql/sql/sql_statement.py index 3355836..eb9931e 100644 --- a/parseql/sql/sql_statement.py +++ b/parseql/sql/sql_statement.py @@ -7,12 +7,13 @@ class SqlStatement(object): """Class representing a single SQL statement """ - def __init__(self, sql=None): + def __init__(self, sql=None, transactional=False): """Constructor for the SqlStatement class """ if sql is None: sql = '' self._raw_sql = sql + self.transactional = transactional self._raw_statement = self._sanatize_sql() def __str__(self): @@ -31,7 +32,7 @@ def _sanatize_sql(self): if self._raw_sql is None: return '' - raw_statements = sanatize_sql(self._raw_sql, keep_transaction=True) + raw_statements = sanatize_sql(self._raw_sql, self.transactional) if len(raw_statements) > 1: raise ValueError('SQL Statement can not contain more than 1 query') diff --git a/parseql/sql/tests/test_sql_script.py b/parseql/sql/tests/test_sql_script.py index 25373eb..44e9d37 100644 --- a/parseql/sql/tests/test_sql_script.py +++ b/parseql/sql/tests/test_sql_script.py @@ -99,6 +99,6 @@ def test_wrap_transaction(): def test_paranthesis(): """Test sql with paranthesis is sanatized correctly """ - script = SqlScript('create table test (session_id INTEGER);') - result = 'create table test (session_id INTEGER);' + script = SqlScript('create table test_begin (session_id INTEGER);') + result = 'create table test_begin (session_id INTEGER);' eq_(script.sql(), result) diff --git a/parseql/sql/transaction.py b/parseql/sql/transaction.py index f1c3a9a..e0cacc7 100644 --- a/parseql/sql/transaction.py +++ b/parseql/sql/transaction.py @@ -10,7 +10,7 @@ class BeginStatement(SqlStatement): def __init__(self): """Constructor for begin class """ - super(BeginStatement, self).__init__('BEGIN') + super(BeginStatement, self).__init__('BEGIN', True) class CommitStatement(SqlStatement): @@ -19,4 +19,4 @@ class CommitStatement(SqlStatement): def __init__(self): """Constructor for Commit class """ - super(CommitStatement, self).__init__('COMMIT') + super(CommitStatement, self).__init__('COMMIT', True) From da2970d1a2a79c3903cfb3ab11a56822327973aa Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 10 Jan 2015 21:04:42 -0800 Subject: [PATCH 038/175] Table object first pass --- parseql/database/column.py | 59 +++++++++++++++ parseql/database/table.py | 122 ++++++++++++++++++++++++++++++++ parseql/parsers/create_table.py | 23 ++++-- parseql/sql/select_statement.py | 13 ++++ parseql/sql/utils.py | 9 +++ 5 files changed, 222 insertions(+), 4 deletions(-) create 
mode 100644 parseql/database/column.py create mode 100644 parseql/database/table.py create mode 100644 parseql/sql/select_statement.py diff --git a/parseql/database/column.py b/parseql/database/column.py new file mode 100644 index 0000000..fc6bb56 --- /dev/null +++ b/parseql/database/column.py @@ -0,0 +1,59 @@ +"""Script containing the column class object +""" + +class Column(object): + """Class representing columns in a table + """ + def __init__(self, column_name, column_type, encoding=None, + fk_reference=None, fk_table=None, is_distkey=False, + is_sortkey=False, is_primarykey=False, is_null=False, + is_not_null=False, position=None): + """Constructor for Column class + """ + + self.column_name = column_name + self.column_type = column_type + self.encoding = encoding + self.fk_reference = fk_reference + self.fk_table = fk_table + self.is_distkey = is_distkey + self.is_sortkey = is_sortkey + self.is_primarykey = is_primarykey + self.is_null = is_null + self.is_not_null = is_not_null + self.position = position + + if is_null and is_not_null: + raise ValueError('Column cannot be both NULL and NOT NULL together') + + if self.is_primarykey: + self.is_not_null = True + self.is_null = False + + def __str__(self): + """String output for the columns + """ + return ' '.join(self.column_name, self.column_type) + + @property + def primary(self): + """Property for the column being part of primary key + """ + return self.is_primarykey + + @primary.setter + def primary(self, value=True): + """Set the primary flag for the column + """ + self.is_primarykey = value + + # Force not null for primary key columns + if self.is_primarykey: + self.is_not_null = True + self.is_null = False + + @property + def name(self): + """Get the name of the column + """ + return self.column_name diff --git a/parseql/database/table.py b/parseql/database/table.py new file mode 100644 index 0000000..b605eac --- /dev/null +++ b/parseql/database/table.py @@ -0,0 +1,122 @@ +"""Script containing the table class object +""" +from copy import deepcopy + +from ..parsers.create_table import parse_create_table +from ..sql.sql_script import SqlScript +from .column import Column + + +class Table(object): + """Class representing tables in the database + """ + def __init__(self, sql): + """Constructor for Table class + """ + + if isinstance(sql, SqlScript): + # Take the first statement and ignore the rest + sql = SqlScript.statements[0] + + parameters = parse_create_table(sql) + + self.sql = sql + self.parameters = parameters + + self.full_name = parameters.get('full_name') + self.temporary = parameters.get('temporary') + self.exists_check = parameters.get('exists_check', False) + + self.sort_keys = parameters.get('sortkey', list()) + self.dist_keys = parameters.get('distkey', list()) + self.diststyle = parameters.get('diststyle', 'EVEN') + + self._constraints = parameters.get('constraints', list()) + + self._columns = dict() + for column_params in parameters.get('columns', list()): + column_name = column_params['column_name'] + self._columns[column_name] = Column(**column_params) + + self.schema_name, self.table_name = self.initialize_name() + self.update_attributes_from_columns() + self.update_columns_with_constrains() + + def __str__(self): + """Output for the print statement of the table + """ + return self.sql + + def copy(self): + """Create a copy of the Table object + """ + return deepcopy(self) + + def initialize_name(self): + """Parse the full name to declare the schema and table name + """ + split_name = self.full_name.split('.') 
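# e.g. 'analytics.events' yields ('analytics', 'events'), while an
# unqualified name such as 'events' keeps schema_name as None below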
+ if len(split_name) == 2: + schema_name = split_name[0] + table_name = split_name[1] + else: + schema_name = None + table_name = self.full_name + + return schema_name, table_name + + def update_attributes_from_columns(self): + """ Update attributes sortkey and distkey based on columns + """ + distkeys = self.dist_keys + sortkeys = self.sort_keys + for column in self._columns.values(): + # Update the table attributes based on columns + if column.is_distkey: + distkeys.append(column.name) + if column.is_sortkey: + sortkeys.append(column.name) + + self.dist_keys = list(set(distkeys)) + self.sort_keys = list(set(sortkeys)) + + def update_columns_with_constrains(self): + """ Update columns with primary and foreign key constraints + """ + for constraint in self._constraints: + for col_name in constraint.get('pk_columns', list()): + self._columns[col_name].primary = True + + @property + def columns(self): + """Columns for the table + """ + return self._columns.values() + + @property + def primary_keys(self): + """Primary keys of the table + """ + return [c for c in self.columns if c.primary] + + def forign_key_references(self): + """Get a list of all foreign key references from the table + """ + result = list() + for column in self.columns: + if column.fk_table is not None: + result.append(( + [column.name], column.fk_table, column.fk_reference)) + + for constraint in self._constraints: + if 'fk_table' in constraint: + result.append((constraint.get('fk_columns'), + constraint.get('fk_table'), + constraint.get('fk_reference_columns'))) + return result + + @property + def dependencies(self): + """List of tables which this table references. + """ + return [table_name for _, table_name, _ in self.foreign_key_references] diff --git a/parseql/parsers/create_table.py b/parseql/parsers/create_table.py index 50b57f4..b807058 100644 --- a/parseql/parsers/create_table.py +++ b/parseql/parsers/create_table.py @@ -33,6 +33,9 @@ from .utils import temporary_check +FK_REFERENCE = 'fk_reference_columns' + + def paranthesis_list(output_name, input_var=_db_name): """Parser for a delimiedList enclosed in paranthesis """ @@ -42,7 +45,7 @@ def paranthesis_list(output_name, input_var=_db_name): def fk_reference(): """Get Parser for foreign key references """ - fk_reference_columns = paranthesis_list('fk_reference_columns') + fk_reference_columns = paranthesis_list(FK_REFERENCE) fk_table = _db_name.setResultsName('fk_table') return _references + fk_table + fk_reference_columns @@ -61,9 +64,9 @@ def get_base_parser(): """ temp_check = temporary_check.setResultsName('temporary') - exists_check = existance_check.setResultsName('if_exists') + exists_check = existance_check.setResultsName('exists_checks') - table_name = _db_name.setResultsName('table_name') + table_name = _db_name.setResultsName('full_name') # Initial portions of the table definition def_start = _create + temp_check + _table + table_name + exists_check @@ -89,7 +92,7 @@ def get_column_parser(): constraints = exists(_not_null, 'is_not_null') constraints |= exists(_null, 'is_null') - constraints |= exists(pk_check, 'is_primary_key') + constraints |= exists(pk_check, 'is_primarykey') constraints |= exists(_distkey, 'is_distkey') constraints |= exists(_sortkey, 'is_sortkey') constraints |= fk_reference() @@ -164,10 +167,22 @@ def parse_create_table(statement): table_data['columns'] = list() table_data['constraints'] = list() + column_position = 0 for field in table_data['raw_fields']: try: column = to_dict(get_column_parser().parseString(field)) + + # Add 
position of the column + column['position'] = column_position + column_position += 1 + + # Change fk_reference_column to string from list + if FK_REFERENCE in column: + column['fk_reference'] = column[FK_REFERENCE][0] + column.pop(FK_REFERENCE) + table_data['columns'].append(column) + except ParseException: try: constraint = to_dict( diff --git a/parseql/sql/select_statement.py b/parseql/sql/select_statement.py new file mode 100644 index 0000000..163d835 --- /dev/null +++ b/parseql/sql/select_statement.py @@ -0,0 +1,13 @@ +"""Script containing the SelectStatement object +""" + +from .sql_statement import SqlStatement + + +class SelectStatement(SqlStatement): + """Class representing SelectStatement from a sql_statement + """ + def __init__(self, sql): + """Constructor for CreateTableStatement class + """ + super(SelectStatement, self).__init__(sql) diff --git a/parseql/sql/utils.py b/parseql/sql/utils.py index 1e92117..bbec25a 100644 --- a/parseql/sql/utils.py +++ b/parseql/sql/utils.py @@ -17,6 +17,15 @@ def atmost_one(*args): return sum([1 for a in args if a is not None]) <= 1 +def exactly_one(*args): + """Asserts one of the arguments is not None + + Returns: + result(bool): True if exactly one of the arguments is not None + """ + return sum([1 for a in args if a is not None]) == 1 + + def sanatize_sql(sql, keep_transaction=False): """Sanatize the sql string """ From 58e2ec27600fe7da2b5ef347c40c66cbdb7e51e4 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 11 Jan 2015 12:37:13 -0800 Subject: [PATCH 039/175] Select Statement --- parseql/database/table.py | 36 +++++++++++-- parseql/parsers/create_table.py | 14 ++--- parseql/parsers/select_query.py | 96 +++++++++++++++++++++++++++++++++ parseql/parsers/utils.py | 12 +++++ parseql/sql/select_statement.py | 17 ++++++ 5 files changed, 161 insertions(+), 14 deletions(-) create mode 100644 parseql/parsers/select_query.py diff --git a/parseql/database/table.py b/parseql/database/table.py index b605eac..3b566ee 100644 --- a/parseql/database/table.py +++ b/parseql/database/table.py @@ -4,6 +4,7 @@ from ..parsers.create_table import parse_create_table from ..sql.sql_script import SqlScript +from ..sql.sql_statement import SqlStatement from .column import Column @@ -20,7 +21,7 @@ def __init__(self, sql): parameters = parse_create_table(sql) - self.sql = sql + self.sql_statement = sql self.parameters = parameters self.full_name = parameters.get('full_name') @@ -45,7 +46,7 @@ def __init__(self, sql): def __str__(self): """Output for the print statement of the table """ - return self.sql + return self.sql_statement def copy(self): """Create a copy of the Table object @@ -112,7 +113,7 @@ def forign_key_references(self): if 'fk_table' in constraint: result.append((constraint.get('fk_columns'), constraint.get('fk_table'), - constraint.get('fk_reference_columns'))) + constraint.get('fk_reference'))) return result @property @@ -120,3 +121,32 @@ def dependencies(self): """List of tables which this table references. 
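Derived from the column-level and constraint-level foreign key references,
so a table with no foreign keys reports no dependencies.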
""" return [table_name for _, table_name, _ in self.foreign_key_references] + + def temporary_clone_statement(self): + """Sql statement to create a temporary clone table + + Note: + The temporary table only copies the schema and not any data + """ + + # We don't need to use schema for temp tables + table_name = self.table_name + '_temp' + + # Create a list of column definitions + columns = ', '.join( + ['%s %s' %(c.column_name, c.column_type) for c in self.columns]) + + # We don't need any constraints to be specified on the temp table + sql = ['CREATE TEMPORARY TABLE %s ( %s )' % (table_name, columns)] + + return SqlStatement(sql) + + def drop_statement(self): + """Sql statment to drop the table + """ + return SqlStatement('DROP TABLE %s CASCADE' % self.full_name) + + def analyze_statement(self): + """Sql statment to analyze the table + """ + return SqlStatement('ANALYZE %s' % self.full_name) diff --git a/parseql/parsers/create_table.py b/parseql/parsers/create_table.py index b807058..c56e8b2 100644 --- a/parseql/parsers/create_table.py +++ b/parseql/parsers/create_table.py @@ -1,13 +1,9 @@ """Create SQL parser """ -from pyparsing import alphanums -from pyparsing import CharsNotIn -from pyparsing import Combine from pyparsing import delimitedList from pyparsing import OneOrMore from pyparsing import ParseException from pyparsing import ParseResults -from pyparsing import Word from pyparsing import ZeroOrMore from ..sql.sql_statement import SqlStatement @@ -27,13 +23,14 @@ from .utils import _sortkey from .utils import _table from .utils import column_types +from .utils import def_field from .utils import existance_check from .utils import isNotEmpty from .utils import pk_check from .utils import temporary_check -FK_REFERENCE = 'fk_reference_columns' +FK_REFERENCE = 'fk_reference' def paranthesis_list(output_name, input_var=_db_name): @@ -71,10 +68,6 @@ def get_base_parser(): # Initial portions of the table definition def_start = _create + temp_check + _table + table_name + exists_check - subquery = Combine('(' + ZeroOrMore(CharsNotIn(')')) + ')') - _word = Word(alphanums+"_-. 
") - def_field = Combine(OneOrMore(_word | subquery)) - table_def = def_start + paranthesis_list('raw_fields', def_field) + \ get_attributes_parser() @@ -178,8 +171,7 @@ def parse_create_table(statement): # Change fk_reference_column to string from list if FK_REFERENCE in column: - column['fk_reference'] = column[FK_REFERENCE][0] - column.pop(FK_REFERENCE) + column[FK_REFERENCE] = column[FK_REFERENCE][0] table_data['columns'].append(column) diff --git a/parseql/parsers/select_query.py b/parseql/parsers/select_query.py new file mode 100644 index 0000000..2600e04 --- /dev/null +++ b/parseql/parsers/select_query.py @@ -0,0 +1,96 @@ +"""Select SQL parser +""" +from pyparsing import restOfLine +from pyparsing import MatchFirst +from pyparsing import delimitedList +from pyparsing import WordStart + +from ..sql.sql_statement import SqlStatement + +from .utils import _db_name +from .utils import _from +from .utils import _join +from .utils import _select +from .utils import def_field + + +def parse_select_base(statement): + """Parse a select query and return the dependencies + + Args: + statement(SqlStatement): Input sql statement that should be parsed + + Returns: + result(list of str): List of dependent tables + """ + + if not isinstance(statement, SqlStatement): + raise ValueError('Input to table parser must of a SqlStatement object') + + string = statement.sql() + + if string == '': + return + + base_parser = _select + restOfLine + + # Sanity check that query starts with select + base_parser.parseString(string) + + +def parse_select_dependencies(statement): + """Parse a select query and return the dependencies + + Args: + statement(SqlStatement): Input sql statement that should be parsed + + Returns: + result(list of str): List of dependent tables + """ + + if not isinstance(statement, SqlStatement): + raise ValueError('Input to table parser must of a SqlStatement object') + + string = statement.sql() + + if string == '': + return list() + + # Find all dependent tables + dep_parse = WordStart() + (_from | _join) + _db_name.setResultsName('table') + output = dep_parse.setParseAction(lambda x: x.table).searchString(string) + + # Flatten the list before returning + flattened_output = [item for sublist in output for item in sublist] + + # Deduplicated the list + return list(set(flattened_output)) + + +def parse_select_columns(statement): + """Parse a select query and return the columns + + Args: + statement(SqlStatement): Input sql statement that should be parsed + + Returns: + result(list of str): List of columns + """ + + if not isinstance(statement, SqlStatement): + raise ValueError('Input to table parser must of a SqlStatement object') + + string = statement.sql() + + if string == '': + return list() + + # Supress everything after the first from + suppressor = MatchFirst(_from) + restOfLine + string = suppressor.suppress().transformString(string) + + parser = _select + delimitedList(def_field).setResultsName('columns') + output = parser.parseString(string).columns.asList() + + # Strip extra whitespace from the string + return [column.strip() for column in output] diff --git a/parseql/parsers/utils.py b/parseql/parsers/utils.py index ddeef01..32e3f6b 100644 --- a/parseql/parsers/utils.py +++ b/parseql/parsers/utils.py @@ -3,6 +3,9 @@ from pyparsing import alphanums from pyparsing import CaselessKeyword +from pyparsing import CharsNotIn +from pyparsing import OneOrMore +from pyparsing import ZeroOrMore from pyparsing import Combine from pyparsing import nums from pyparsing import Optional @@ 
-45,6 +48,11 @@ _even = CaselessKeyword('EVEN') _key = CaselessKeyword('KEY') +# Select SQL Keywords +_select = CaselessKeyword('SELECT') +_from = CaselessKeyword('FROM') +_join = CaselessKeyword('JOIN') + # Parsers _db_name = Word(alphanums+"_-.") temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) @@ -53,3 +61,7 @@ column_types = _smallint | _integer | _bigint | _decimal | _real | _double column_types |= _boolean | _char | _varchar | _date | _timestamp + +subquery = Combine('(' + ZeroOrMore(CharsNotIn(')')) + ')') +_word = Word(alphanums+"_-. *") +def_field = Combine(OneOrMore(_word | subquery)) diff --git a/parseql/sql/select_statement.py b/parseql/sql/select_statement.py index 163d835..e7efc3c 100644 --- a/parseql/sql/select_statement.py +++ b/parseql/sql/select_statement.py @@ -2,6 +2,8 @@ """ from .sql_statement import SqlStatement +from ..parsers.select_query import parse_select_dependencies +from ..parsers.select_query import parse_select_columns class SelectStatement(SqlStatement): @@ -11,3 +13,18 @@ def __init__(self, sql): """Constructor for CreateTableStatement class """ super(SelectStatement, self).__init__(sql) + + self._dependencies = parse_select_dependencies(self.sql()) + self._columns = parse_select_columns(self.sql()) + + @property + def dependencies(self): + """Table dependencies of the select statement + """ + return self._dependencies + + @property + def columns(self): + """Table columns of the select statement + """ + return self._columns From 14f46a4e7187fd6ea420fc01cd9e67b5d68cd5b9 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 11 Jan 2015 15:57:49 -0800 Subject: [PATCH 040/175] Fix for migration from independent repo --- dataduct/config/config.py | 4 ++-- dataduct/database/__init__.py | 1 + {parseql => dataduct}/database/column.py | 0 .../database}/parsers/__init__.py | 5 +++++ .../database}/parsers/create_table.py | 2 +- .../database}/parsers/select_query.py | 14 -------------- .../database/parsers/tests}/__init__.py | 0 .../parsers/tests/test_create_table.py | 0 .../database}/parsers/tests/test_transfrom.py | 0 .../database}/parsers/transform.py | 0 .../database}/parsers/utils.py | 0 dataduct/database/sql/__init__.py | 3 +++ .../database}/sql/select_statement.py | 4 ++-- .../database}/sql/sql_script.py | 3 ++- .../database}/sql/sql_statement.py | 0 .../database/sql}/tests/__init__.py | 0 .../database}/sql/tests/test_sql_script.py | 0 .../database}/sql/tests/test_sql_statement.py | 0 .../database}/sql/transaction.py | 0 {parseql => dataduct/database}/sql/utils.py | 18 ------------------ {parseql => dataduct}/database/table.py | 6 +++--- .../database}/tests/__init__.py | 0 dataduct/utils/helpers.py | 9 +++++++++ parseql/__init__.py | 4 ---- parseql/sql/__init__.py | 0 parseql/sql/tests/__init__.py | 0 26 files changed, 28 insertions(+), 45 deletions(-) create mode 100644 dataduct/database/__init__.py rename {parseql => dataduct}/database/column.py (100%) rename {parseql => dataduct/database}/parsers/__init__.py (58%) rename {parseql => dataduct/database}/parsers/create_table.py (99%) rename {parseql => dataduct/database}/parsers/select_query.py (83%) rename {parseql/database => dataduct/database/parsers/tests}/__init__.py (100%) rename {parseql => dataduct/database}/parsers/tests/test_create_table.py (100%) rename {parseql => dataduct/database}/parsers/tests/test_transfrom.py (100%) rename {parseql => dataduct/database}/parsers/transform.py (100%) rename {parseql => dataduct/database}/parsers/utils.py (100%) create mode 100644 
dataduct/database/sql/__init__.py rename {parseql => dataduct/database}/sql/select_statement.py (86%) rename {parseql => dataduct/database}/sql/sql_script.py (98%) rename {parseql => dataduct/database}/sql/sql_statement.py (100%) rename {parseql/database => dataduct/database/sql}/tests/__init__.py (100%) rename {parseql => dataduct/database}/sql/tests/test_sql_script.py (100%) rename {parseql => dataduct/database}/sql/tests/test_sql_statement.py (100%) rename {parseql => dataduct/database}/sql/transaction.py (100%) rename {parseql => dataduct/database}/sql/utils.py (60%) rename {parseql => dataduct}/database/table.py (97%) rename {parseql/parsers => dataduct/database}/tests/__init__.py (100%) delete mode 100644 parseql/__init__.py delete mode 100644 parseql/sql/__init__.py delete mode 100644 parseql/sql/tests/__init__.py diff --git a/dataduct/config/config.py b/dataduct/config/config.py index c1e2cac..dafc336 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -14,8 +14,8 @@ def get_config_files(): 3. DATADUCT_PATH environment variable """ dataduct_config_path = '/etc/dataduct.cfg' - dataduct_user_config_path = os.path.join( - os.path.expanduser('~'),'.dataduct')) + dataduct_user_config_path = os.path.join(os.path.expanduser('~'), + '.dataduct') config_files = [dataduct_config_path, dataduct_user_config_path] # Check DATADUCT_PATH env variable for other configuration locations diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py new file mode 100644 index 0000000..0a813ff --- /dev/null +++ b/dataduct/database/__init__.py @@ -0,0 +1 @@ +from .table import Table \ No newline at end of file diff --git a/parseql/database/column.py b/dataduct/database/column.py similarity index 100% rename from parseql/database/column.py rename to dataduct/database/column.py diff --git a/parseql/parsers/__init__.py b/dataduct/database/parsers/__init__.py similarity index 58% rename from parseql/parsers/__init__.py rename to dataduct/database/parsers/__init__.py index 5b4fe6d..fc973a5 100644 --- a/parseql/parsers/__init__.py +++ b/dataduct/database/parsers/__init__.py @@ -3,3 +3,8 @@ from .transform import remove_transactional from .transform import split_statements from .transform import remove_newlines + +from .select_query import parse_select_dependencies +from .select_query import parse_select_columns + +from .create_table import parse_create_table diff --git a/parseql/parsers/create_table.py b/dataduct/database/parsers/create_table.py similarity index 99% rename from parseql/parsers/create_table.py rename to dataduct/database/parsers/create_table.py index c56e8b2..025659a 100644 --- a/parseql/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -6,7 +6,7 @@ from pyparsing import ParseResults from pyparsing import ZeroOrMore -from ..sql.sql_statement import SqlStatement +from ..sql import SqlStatement from .utils import _all from .utils import _create diff --git a/parseql/parsers/select_query.py b/dataduct/database/parsers/select_query.py similarity index 83% rename from parseql/parsers/select_query.py rename to dataduct/database/parsers/select_query.py index 2600e04..ba14012 100644 --- a/parseql/parsers/select_query.py +++ b/dataduct/database/parsers/select_query.py @@ -5,8 +5,6 @@ from pyparsing import delimitedList from pyparsing import WordStart -from ..sql.sql_statement import SqlStatement - from .utils import _db_name from .utils import _from from .utils import _join @@ -23,10 +21,6 @@ def parse_select_base(statement): Returns: 
result(list of str): List of dependent tables """ - - if not isinstance(statement, SqlStatement): - raise ValueError('Input to table parser must of a SqlStatement object') - string = statement.sql() if string == '': @@ -47,10 +41,6 @@ def parse_select_dependencies(statement): Returns: result(list of str): List of dependent tables """ - - if not isinstance(statement, SqlStatement): - raise ValueError('Input to table parser must of a SqlStatement object') - string = statement.sql() if string == '': @@ -76,10 +66,6 @@ def parse_select_columns(statement): Returns: result(list of str): List of columns """ - - if not isinstance(statement, SqlStatement): - raise ValueError('Input to table parser must of a SqlStatement object') - string = statement.sql() if string == '': diff --git a/parseql/database/__init__.py b/dataduct/database/parsers/tests/__init__.py similarity index 100% rename from parseql/database/__init__.py rename to dataduct/database/parsers/tests/__init__.py diff --git a/parseql/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py similarity index 100% rename from parseql/parsers/tests/test_create_table.py rename to dataduct/database/parsers/tests/test_create_table.py diff --git a/parseql/parsers/tests/test_transfrom.py b/dataduct/database/parsers/tests/test_transfrom.py similarity index 100% rename from parseql/parsers/tests/test_transfrom.py rename to dataduct/database/parsers/tests/test_transfrom.py diff --git a/parseql/parsers/transform.py b/dataduct/database/parsers/transform.py similarity index 100% rename from parseql/parsers/transform.py rename to dataduct/database/parsers/transform.py diff --git a/parseql/parsers/utils.py b/dataduct/database/parsers/utils.py similarity index 100% rename from parseql/parsers/utils.py rename to dataduct/database/parsers/utils.py diff --git a/dataduct/database/sql/__init__.py b/dataduct/database/sql/__init__.py new file mode 100644 index 0000000..24a8cdf --- /dev/null +++ b/dataduct/database/sql/__init__.py @@ -0,0 +1,3 @@ +from .sql_statement import SqlStatement +from .sql_script import SqlScript +from .select_statement import SelectStatement diff --git a/parseql/sql/select_statement.py b/dataduct/database/sql/select_statement.py similarity index 86% rename from parseql/sql/select_statement.py rename to dataduct/database/sql/select_statement.py index e7efc3c..c11e539 100644 --- a/parseql/sql/select_statement.py +++ b/dataduct/database/sql/select_statement.py @@ -2,8 +2,8 @@ """ from .sql_statement import SqlStatement -from ..parsers.select_query import parse_select_dependencies -from ..parsers.select_query import parse_select_columns +from ..parsers import parse_select_dependencies +from ..parsers import parse_select_columns class SelectStatement(SqlStatement): diff --git a/parseql/sql/sql_script.py b/dataduct/database/sql/sql_script.py similarity index 98% rename from parseql/sql/sql_script.py rename to dataduct/database/sql/sql_script.py index 6a31476..a8cc254 100644 --- a/parseql/sql/sql_script.py +++ b/dataduct/database/sql/sql_script.py @@ -5,9 +5,10 @@ from .sql_statement import SqlStatement from .transaction import BeginStatement from .transaction import CommitStatement -from .utils import atmost_one from .utils import sanatize_sql +from ...utils.helpers import atmost_one + class SqlScript(object): """Class representing a single SQL Script diff --git a/parseql/sql/sql_statement.py b/dataduct/database/sql/sql_statement.py similarity index 100% rename from parseql/sql/sql_statement.py rename to 
dataduct/database/sql/sql_statement.py diff --git a/parseql/database/tests/__init__.py b/dataduct/database/sql/tests/__init__.py similarity index 100% rename from parseql/database/tests/__init__.py rename to dataduct/database/sql/tests/__init__.py diff --git a/parseql/sql/tests/test_sql_script.py b/dataduct/database/sql/tests/test_sql_script.py similarity index 100% rename from parseql/sql/tests/test_sql_script.py rename to dataduct/database/sql/tests/test_sql_script.py diff --git a/parseql/sql/tests/test_sql_statement.py b/dataduct/database/sql/tests/test_sql_statement.py similarity index 100% rename from parseql/sql/tests/test_sql_statement.py rename to dataduct/database/sql/tests/test_sql_statement.py diff --git a/parseql/sql/transaction.py b/dataduct/database/sql/transaction.py similarity index 100% rename from parseql/sql/transaction.py rename to dataduct/database/sql/transaction.py diff --git a/parseql/sql/utils.py b/dataduct/database/sql/utils.py similarity index 60% rename from parseql/sql/utils.py rename to dataduct/database/sql/utils.py index bbec25a..ecadab4 100644 --- a/parseql/sql/utils.py +++ b/dataduct/database/sql/utils.py @@ -8,24 +8,6 @@ from ..parsers import remove_newlines -def atmost_one(*args): - """Asserts one of the arguments is not None - - Returns: - result(bool): True if exactly one of the arguments is not None - """ - return sum([1 for a in args if a is not None]) <= 1 - - -def exactly_one(*args): - """Asserts one of the arguments is not None - - Returns: - result(bool): True if exactly one of the arguments is not None - """ - return sum([1 for a in args if a is not None]) == 1 - - def sanatize_sql(sql, keep_transaction=False): """Sanatize the sql string """ diff --git a/parseql/database/table.py b/dataduct/database/table.py similarity index 97% rename from parseql/database/table.py rename to dataduct/database/table.py index 3b566ee..8527d93 100644 --- a/parseql/database/table.py +++ b/dataduct/database/table.py @@ -2,9 +2,9 @@ """ from copy import deepcopy -from ..parsers.create_table import parse_create_table -from ..sql.sql_script import SqlScript -from ..sql.sql_statement import SqlStatement +from .parsers import parse_create_table +from .sql import SqlScript +from .sql import SqlStatement from .column import Column diff --git a/parseql/parsers/tests/__init__.py b/dataduct/database/tests/__init__.py similarity index 100% rename from parseql/parsers/tests/__init__.py rename to dataduct/database/tests/__init__.py diff --git a/dataduct/utils/helpers.py b/dataduct/utils/helpers.py index 8d53ae6..4162f49 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -12,6 +12,15 @@ CUSTOM_STEPS_PATH = 'CUSTOM_STEPS_PATH' +def atmost_one(*args): + """Asserts one of the arguments is not None + + Returns: + result(bool): True if exactly one of the arguments is not None + """ + return sum([1 for a in args if a is not None]) <= 1 + + def exactly_one(*args): """Asserts one of the arguments is not None diff --git a/parseql/__init__.py b/parseql/__init__.py deleted file mode 100644 index 03b2b5e..0000000 --- a/parseql/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -"""Welcome to Parseql -""" -__version__ = '0.1.0' -__import__('pkg_resources').declare_namespace(__name__) diff --git a/parseql/sql/__init__.py b/parseql/sql/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/parseql/sql/tests/__init__.py b/parseql/sql/tests/__init__.py deleted file mode 100644 index e69de29..0000000 From cd1799c3bd39d672bb5aebf1448e0058243feb5f Mon Sep 17 00:00:00 
2001 From: Sourabh Bajaj Date: Sun, 11 Jan 2015 23:42:15 -0800 Subject: [PATCH 041/175] View Object --- dataduct/database/__init__.py | 5 +- dataduct/database/parsers/__init__.py | 1 + dataduct/database/parsers/create_table.py | 36 ++---------- dataduct/database/parsers/create_view.py | 49 ++++++++++++++++ dataduct/database/parsers/helpers.py | 45 ++++++++++++++ dataduct/database/parsers/utils.py | 9 +-- dataduct/database/sql/utils.py | 14 +++++ dataduct/database/table.py | 19 +++--- dataduct/database/view.py | 71 +++++++++++++++++++++++ dataduct/steps/extract_rds.py | 20 +------ 10 files changed, 205 insertions(+), 64 deletions(-) create mode 100644 dataduct/database/parsers/create_view.py create mode 100644 dataduct/database/parsers/helpers.py create mode 100644 dataduct/database/view.py diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py index 0a813ff..df09213 100644 --- a/dataduct/database/__init__.py +++ b/dataduct/database/__init__.py @@ -1 +1,4 @@ -from .table import Table \ No newline at end of file +from .table import Table +from .sql import SqlStatement +from .sql import SqlScript +from .sql import SelectStatement diff --git a/dataduct/database/parsers/__init__.py b/dataduct/database/parsers/__init__.py index fc973a5..aa967b7 100644 --- a/dataduct/database/parsers/__init__.py +++ b/dataduct/database/parsers/__init__.py @@ -8,3 +8,4 @@ from .select_query import parse_select_columns from .create_table import parse_create_table +from .create_view import parse_create_view diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index 025659a..b07c8fb 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -1,9 +1,7 @@ """Create SQL parser """ -from pyparsing import delimitedList from pyparsing import OneOrMore from pyparsing import ParseException -from pyparsing import ParseResults from pyparsing import ZeroOrMore from ..sql import SqlStatement @@ -24,19 +22,16 @@ from .utils import _table from .utils import column_types from .utils import def_field -from .utils import existance_check -from .utils import isNotEmpty from .utils import pk_check -from .utils import temporary_check - -FK_REFERENCE = 'fk_reference' +from .helpers import existance_check +from .helpers import exists +from .helpers import paranthesis_list +from .helpers import temporary_check +from .helpers import to_dict -def paranthesis_list(output_name, input_var=_db_name): - """Parser for a delimiedList enclosed in paranthesis - """ - return '(' + delimitedList(input_var).setResultsName(output_name) + ')' +FK_REFERENCE = 'fk_reference' def fk_reference(): @@ -47,12 +42,6 @@ def fk_reference(): return _references + fk_table + fk_reference_columns -def exists(parser, output_name): - """Get a parser that returns boolean on existance - """ - return parser.setParseAction(isNotEmpty).setResultsName(output_name) - - def get_base_parser(): """Get a pyparsing parser for a create table statement @@ -125,19 +114,6 @@ def get_attributes_parser(): return OneOrMore(diststyle_def | sortkey_def | distkey_def) -def to_dict(input): - """Purge the ParseResults from output dictionary - """ - output = dict() - for key, value in input.asDict().iteritems(): - if isinstance(value, ParseResults): - output[key] = value.asList() - else: - output[key] = value - - return output - - def parse_create_table(statement): """Parse the create table sql query and return metadata diff --git a/dataduct/database/parsers/create_view.py 
b/dataduct/database/parsers/create_view.py new file mode 100644 index 0000000..78879af --- /dev/null +++ b/dataduct/database/parsers/create_view.py @@ -0,0 +1,49 @@ +"""Create SQL parser +""" +from pyparsing import Group +from pyparsing import printables +from pyparsing import StringEnd +from pyparsing import Word +from pyparsing import ZeroOrMore + +from ..sql import SqlStatement + +from .utils import _create +from .utils import _view +from .utils import _db_name +from .utils import _as + +from .helpers import to_dict +from .helpers import replace_check + + +merge = lambda x: ' '.join(x[0]) + + +def parse_create_view(statement): + """Parse the create view sql query and return metadata + + Args: + statement(SqlStatement): Input sql statement that should be parsed + + Returns: + view_data(dict): view_data dictionary for instantiating a view object + """ + + if not isinstance(statement, SqlStatement): + raise ValueError('Input to view parser must of a SqlStatement object') + + string = statement.sql() + + end = ')' + StringEnd() + select = Group(ZeroOrMore(~end + Word(printables))) + + parser = _create + replace_check.setResultsName('replace') + _view + parser += _db_name.setResultsName('view_name') + _as + '(' + parser += select.setParseAction(merge).setResultsName('select_statement') + parser += end + + # Parse the base table definitions + view_data = to_dict(parser.parseString(string)) + + return view_data diff --git a/dataduct/database/parsers/helpers.py b/dataduct/database/parsers/helpers.py new file mode 100644 index 0000000..74068c5 --- /dev/null +++ b/dataduct/database/parsers/helpers.py @@ -0,0 +1,45 @@ +"""SQL parser helpers +""" +from pyparsing import delimitedList +from pyparsing import Optional +from pyparsing import ParseResults + +from .utils import _db_name +from .utils import _temp +from .utils import _temporary +from .utils import _if_not_exists +from .utils import _or_replace + +# Functions +isNotEmpty = lambda x: len(x) > 0 + +temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) + +replace_check = Optional(_or_replace).setParseAction(isNotEmpty) + +existance_check = Optional(_if_not_exists).setParseAction(isNotEmpty) + + +def paranthesis_list(output_name, input_var=_db_name): + """Parser for a delimiedList enclosed in paranthesis + """ + return '(' + delimitedList(input_var).setResultsName(output_name) + ')' + + +def exists(parser, output_name): + """Get a parser that returns boolean on existance + """ + return parser.setParseAction(isNotEmpty).setResultsName(output_name) + + +def to_dict(input): + """Purge the ParseResults from output dictionary + """ + output = dict() + for key, value in input.asDict().iteritems(): + if isinstance(value, ParseResults): + output[key] = value.asList() + else: + output[key] = value + + return output diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index 32e3f6b..3360f4b 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -8,13 +8,9 @@ from pyparsing import ZeroOrMore from pyparsing import Combine from pyparsing import nums -from pyparsing import Optional from pyparsing import Word -# Functions -isNotEmpty = lambda x: len(x) > 0 - # Data types _smallint = CaselessKeyword('SMALLINT') _integer = CaselessKeyword('INTEGER') @@ -31,9 +27,11 @@ # Create SQL keywords _create = CaselessKeyword('CREATE') _table = CaselessKeyword('TABLE') +_view = CaselessKeyword('VIEW') _temp = CaselessKeyword('TEMP') _temporary = CaselessKeyword('TEMPORARY') 
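# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the patch above): the pattern used by
# create_view.py and helpers.py is to name sub-results in a pyparsing grammar
# and then flatten the ParseResults into a plain dict via to_dict(). The toy
# grammar and the statement below are invented for illustration only.
from pyparsing import CaselessKeyword, Word, alphanums, delimitedList

_create_kw = CaselessKeyword('CREATE')
_view_kw = CaselessKeyword('VIEW')
_name = Word(alphanums + '_-.')

toy_parser = (_create_kw + _view_kw + _name.setResultsName('view_name') +
              '(' + delimitedList(_name).setResultsName('columns') + ')')

result = toy_parser.parseString('CREATE VIEW staging.user_view (id, name, email)')

# result['view_name'] comes back as a plain string, while result['columns'] is
# still a ParseResults; to_dict() in helpers.py converts such values with
# asList(), yielding a dict along the lines of
#   {'view_name': 'staging.user_view', 'columns': ['id', 'name', 'email']}
print result.asDict()
# ---------------------------------------------------------------------------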
_if_not_exists = CaselessKeyword('IF NOT EXISTS') +_or_replace = CaselessKeyword('OR REPLACE') _primary_key = CaselessKeyword('PRIMARY KEY') _foreign_key = CaselessKeyword('FOREIGN KEY') _references = CaselessKeyword('REFERENCES') @@ -51,12 +49,11 @@ # Select SQL Keywords _select = CaselessKeyword('SELECT') _from = CaselessKeyword('FROM') +_as = CaselessKeyword('AS') _join = CaselessKeyword('JOIN') # Parsers _db_name = Word(alphanums+"_-.") -temporary_check = Optional(_temp | _temporary).setParseAction(isNotEmpty) -existance_check = Optional(_if_not_exists).setParseAction(isNotEmpty) pk_check = (_primary_key | _unique) column_types = _smallint | _integer | _bigint | _decimal | _real | _double diff --git a/dataduct/database/sql/utils.py b/dataduct/database/sql/utils.py index ecadab4..c179bd5 100644 --- a/dataduct/database/sql/utils.py +++ b/dataduct/database/sql/utils.py @@ -8,6 +8,20 @@ from ..parsers import remove_newlines +def balanced_parenthesis(statement): + """Check if the SQL statement is balanced + """ + counter = 0 + for character in statement: + if character == '(': + counter += 1 + if character == ')': + counter -= 1 + if counter < 0: + return False + return counter == 0 + + def sanatize_sql(sql, keep_transaction=False): """Sanatize the sql string """ diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 8527d93..80fe9fc 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -4,7 +4,6 @@ from .parsers import parse_create_table from .sql import SqlScript -from .sql import SqlStatement from .column import Column @@ -122,8 +121,8 @@ def dependencies(self): """ return [table_name for _, table_name, _ in self.foreign_key_references] - def temporary_clone_statement(self): - """Sql statement to create a temporary clone table + def temporary_clone_script(self): + """Sql script to create a temporary clone table Note: The temporary table only copies the schema and not any data @@ -139,14 +138,14 @@ def temporary_clone_statement(self): # We don't need any constraints to be specified on the temp table sql = ['CREATE TEMPORARY TABLE %s ( %s )' % (table_name, columns)] - return SqlStatement(sql) + return SqlScript(sql) - def drop_statement(self): - """Sql statment to drop the table + def drop_script(self): + """Sql script to drop the table """ - return SqlStatement('DROP TABLE %s CASCADE' % self.full_name) + return SqlScript('DROP TABLE %s CASCADE' % self.full_name) - def analyze_statement(self): - """Sql statment to analyze the table + def analyze_script(self): + """Sql script to analyze the table """ - return SqlStatement('ANALYZE %s' % self.full_name) + return SqlScript('ANALYZE %s' % self.full_name) diff --git a/dataduct/database/view.py b/dataduct/database/view.py new file mode 100644 index 0000000..44fa6cd --- /dev/null +++ b/dataduct/database/view.py @@ -0,0 +1,71 @@ +"""Script containing the view class object +""" +from copy import deepcopy + +from .parsers import parse_create_view +from .sql import SqlScript +from .sql import SelectStatement + + +class View(object): + """Class representing view in the database + """ + def __init__(self, sql): + """Constructor for view class + """ + + if isinstance(sql, SqlScript): + # Take the first statement and ignore the rest + sql = SqlScript.statements[0] + + parameters = parse_create_view(sql) + + self.sql_statement = sql + self.parameters = parameters + + self.full_name = parameters.get('view_name') + self.replace_flag = parameters.get('replace', False) + + self.select_statement = 
SelectStatement(parameters.get('select_statement')) + + self.schema_name, self.view_name = self.initialize_name() + + def __str__(self): + """Output for the print statement of the view + """ + return self.sql_statement + + def copy(self): + """Create a copy of the view object + """ + return deepcopy(self) + + def initialize_name(self): + """Parse the full name to declare the schema and view name + """ + split_name = self.full_name.split('.') + if len(split_name) == 2: + schema_name = split_name[0] + view_name = split_name[1] + else: + schema_name = None + view_name = self.view_name + + return schema_name, view_name + + @property + def dependencies(self): + """List of relations which this view references. + """ + return self.select_statement.dependencies + + @property + def columns(self): + """List of columns in the view's select statement + """ + return self.select_statement.columns + + def drop_script(self): + """Sql script to drop the view + """ + return SqlScript('DROP VIEW %s CASCADE' % self.full_name) diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index e7c9107..c33d023 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -1,8 +1,6 @@ """ ETL step wrapper to extract data from RDS to S3 """ -from re import findall - from ..config import Config from .etl_step import ETLStep from ..pipeline import CopyActivity @@ -11,6 +9,7 @@ from ..pipeline import ShellCommandActivity from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError +from ..database import SelectStatement config = Config() if not hasattr(config, 'mysql'): @@ -19,19 +18,6 @@ MYSQL_CONFIG = config.mysql -def guess_input_tables(sql): - """Guess input tables from the sql query - - Returns: - results(list of str): tables which are used in the sql statement - """ - results = findall(r'from ([A-Za-z0-9._]+)', sql) - results.extend(findall(r'FROM ([A-Za-z0-9._]+)', sql)) - results.extend(findall(r'join ([A-Za-z0-9._]+)', sql)) - results.extend(findall(r'JOIN ([A-Za-z0-9._]+)', sql)) - return list(set(results)) - - class ExtractRdsStep(ETLStep): """Extract Redshift Step class that helps get data out of redshift """ @@ -58,9 +44,9 @@ def __init__(self, super(ExtractRdsStep, self).__init__(**kwargs) if table: - sql = 'select * from %s;' % table + sql = 'SELECT * FROM %s;' % table elif sql: - table = guess_input_tables(sql) + table = SelectStatement(sql).dependencies else: raise ETLInputError('Provide a sql statement or a table name') From 99d4011977f7959a02e8462eadded680c1f0a4ee Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 12 Jan 2015 00:19:37 -0800 Subject: [PATCH 042/175] Relation Object --- dataduct/database/relation.py | 84 ++++++++++++++++++++++++++ dataduct/database/sql/sql_statement.py | 7 ++- dataduct/database/table.py | 29 +-------- dataduct/database/view.py | 30 +-------- dataduct/utils/helpers.py | 9 +++ 5 files changed, 105 insertions(+), 54 deletions(-) create mode 100644 dataduct/database/relation.py diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py new file mode 100644 index 0000000..4223008 --- /dev/null +++ b/dataduct/database/relation.py @@ -0,0 +1,84 @@ +"""Script containing the relation class object +""" +from copy import deepcopy +from .sql import SqlScript +from ..config import Config +from ..utils.helpers import atleast_one + + +class Relation(object): + """Class representing a relation in the database + """ + + def __str__(self): + """Output for the print statement of the relation + """ + return 
self.sql_statement + + def copy(self): + """Create a copy of the relation object + """ + return deepcopy(self) + + def initialize_name(self): + """Parse the full name to declare the schema and relation name + """ + split_name = self.full_name.split('.') + if len(split_name) == 2: + schema_name = split_name[0] + relation_name = split_name[1] + else: + schema_name = None + relation_name = self.full_name + + return schema_name, relation_name + + def _grant_sql_builder(self, permission, user=None, group=None): + """Return the sql string for granting permissions + """ + if not atleast_one(user, group): + raise ValueError('Atleast one of user / group needed') + + result = list() + base = 'GRANT %s ON %s TO ' % (permission, self.full_name) + + if user is not None: + result.append(base + user) + + if group is not None: + result.append(base + 'GROUP %s' % group) + + def grant_script(self): + """Grant the permissions based on the config + """ + config = Config() + if not hasattr(config, 'database'): + return + + permissions = config.database.get('permissions', list()) + + sql = list() + for permission in permissions: + sql.extend(self._grant_sql_builder(**permission)) + + return SqlScript('; '.join(sql)) + + def select_script(self): + """Select everything from the relation + """ + return SqlScript('SELECT * FROM %s' % self.full_name) + + def create_script(self, grant_permissions=True): + """Create script for the table object + """ + script = SqlScript(statements=[self.sql_statement.copy()]) + if grant_permissions: + script.append(self.grant_script()) + return script + + def recreate_script(self): + """Sql script to recreate the view + """ + script = self.drop_script() + script.append(self.create_script()) + return script diff --git a/dataduct/database/sql/sql_statement.py b/dataduct/database/sql/sql_statement.py index eb9931e..61bdcb4 100644 --- a/dataduct/database/sql/sql_statement.py +++ b/dataduct/database/sql/sql_statement.py @@ -1,6 +1,6 @@ """Script that contains the sql statement class """ - +from copy import deepcopy from .utils import sanatize_sql @@ -21,6 +21,11 @@ def __str__(self): """ return self.sql() + def copy(self): + """Create a copy of the relation object + """ + return deepcopy(self) + def sql(self): """Returns the raw_sql for the SqlStatement """ diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 80fe9fc..f7fc7fb 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -1,18 +1,18 @@ """Script containing the table class object """ -from copy import deepcopy - from .parsers import parse_create_table from .sql import SqlScript from .column import Column +from .relation import Relation -class Table(object): +class Table(Relation): """Class representing tables in the database """ def __init__(self, sql): """Constructor for Table class """ + super(Table, self).__init__() if isinstance(sql, SqlScript): # Take the first statement and ignore the rest @@ -42,29 +42,6 @@ def __init__(self, sql): self.update_attributes_from_columns() self.update_columns_with_constrains() - def __str__(self): - """Output for the print statement of the table - """ - return self.sql_statement - - def copy(self): - """Create a copy of the Table object - """ - return deepcopy(self) - - def initialize_name(self): - """Parse the full name to declare the schema and table name - """ - split_name = self.full_name.split('.') - if len(split_name) == 2: - schema_name = split_name[0] - table_name = split_name[1] - else: - schema_name = None - table_name = self.full_name - - 
return schema_name, table_name - def update_attributes_from_columns(self): """ Update attributes sortkey and distkey based on columns """ diff --git a/dataduct/database/view.py b/dataduct/database/view.py index 44fa6cd..3ae7ff5 100644 --- a/dataduct/database/view.py +++ b/dataduct/database/view.py @@ -1,19 +1,18 @@ """Script containing the view class object """ -from copy import deepcopy - from .parsers import parse_create_view from .sql import SqlScript from .sql import SelectStatement +from .relation import Relation -class View(object): +class View(Relation): """Class representing view in the database """ def __init__(self, sql): """Constructor for view class """ - + super(View, self).__init__() if isinstance(sql, SqlScript): # Take the first statement and ignore the rest sql = SqlScript.statements[0] @@ -30,29 +29,6 @@ def __init__(self, sql): self.schema_name, self.view_name = self.initialize_name() - def __str__(self): - """Output for the print statement of the view - """ - return self.sql_statement - - def copy(self): - """Create a copy of the view object - """ - return deepcopy(self) - - def initialize_name(self): - """Parse the full name to declare the schema and view name - """ - split_name = self.full_name.split('.') - if len(split_name) == 2: - schema_name = split_name[0] - view_name = split_name[1] - else: - schema_name = None - view_name = self.view_name - - return schema_name, view_name - @property def dependencies(self): """List of relations which this view references. diff --git a/dataduct/utils/helpers.py b/dataduct/utils/helpers.py index 4162f49..49265af 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -21,6 +21,15 @@ def atmost_one(*args): return sum([1 for a in args if a is not None]) <= 1 +def atleast_one(*args): + """Asserts one of the arguments is not None + + Returns: + result(bool): True if atleast one of the arguments is not None + """ + return sum([1 for a in args if a is not None]) >= 1 + + def exactly_one(*args): """Asserts one of the arguments is not None From ff5f7aa410de8cc47a2473ba79d8508883af0ef6 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 12 Jan 2015 00:22:10 -0800 Subject: [PATCH 043/175] add database package --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 4e4424b..2d6fa2c 100644 --- a/setup.py +++ b/setup.py @@ -11,6 +11,7 @@ 'dataduct', 'dataduct.config', 'dataduct.data_access', + 'dataduct.database', 'dataduct.etl', 'dataduct.pipeline', 'dataduct.qa', From 364165dcaebbca0316605b63a3c55af03356050a Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 12 Jan 2015 10:56:58 -0800 Subject: [PATCH 044/175] Database Class --- .gitignore | 2 + dataduct/database/__init__.py | 6 +- dataduct/database/database.py | 106 ++++++++++++++++++++++ dataduct/database/parsers/create_table.py | 6 -- dataduct/database/parsers/create_view.py | 6 -- dataduct/database/relation.py | 4 +- dataduct/database/table.py | 2 +- dataduct/database/view.py | 2 +- 8 files changed, 115 insertions(+), 19 deletions(-) create mode 100644 dataduct/database/database.py diff --git a/.gitignore b/.gitignore index 3256ab0..3652bb7 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,5 @@ # Images created should be checked in manually *.png + +.coverage diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py index df09213..d72e57a 100644 --- a/dataduct/database/__init__.py +++ b/dataduct/database/__init__.py @@ -1,4 +1,4 @@ -from .table import Table -from .sql import SqlStatement -from .sql import 
SqlScript from .sql import SelectStatement +from .sql import SqlScript +from .sql import SqlStatement +from .table import Table diff --git a/dataduct/database/database.py b/dataduct/database/database.py new file mode 100644 index 0000000..867dac5 --- /dev/null +++ b/dataduct/database/database.py @@ -0,0 +1,106 @@ +"""Script containing the database class object +""" +from copy import deepcopy +from .relation import Relation +from .view import View +from .table import Table + + +class Database(object): + """Class representing a database + """ + + def __init__(self, relations=None): + """Constructor for the database class + """ + self._relations = {} + + if relations: + for relation in relations: + self.add_relation(relation) + + def copy(self): + """Create a copy of the database object + """ + return deepcopy(self) + + def add_relation(self, relation): + """Add a relation, only if its name is not already used. + """ + assert isinstance(relation, Relation), 'Input should be a relation' + if relation.full_name in self._relations: + raise ValueError( + 'Relation %s already added to database' % relation.full_name) + + self._relations[relation.full_name] = relation + + def relations(self): + """Unsorted list of relations of the database + """ + return self._relations.values() + + def relation(self, relation_name): + """Get the relation with the given name + """ + return self._relations.get(relation_name, None) + + @property + def num_views(self): + """The number of views in the database + """ + return len([a for a in self.relations() if isinstance(a, View)]) + + @property + def num_tables(self): + """The number of tables in the database + """ + return len([a for a in self.relations() if isinstance(a, Table)]) + + def has_cycles(self, relation=None, visited=None): + """Check if the database has no circular dependencies + """ + if visited is None: + visited = list() + + if relation: + # Don't include table as own dependency, ignore references not in DB + relations_to_check = [ + self.relation(x) for x in relation.dependencies + if x != relation and self.relation(x) is not None] + else: + relations_to_check = self._relations.values() + + for relation in relations_to_check: + if relation.full_name in visited: + return True + # Make a copy for immutability + visited = deepcopy(visited) + visited.append(relation.full_name) + if self.has_cycles(relation, visited): + return True + return False + + def sorted_relations(self): + """Topological sort of the relations for dependency management + """ + if self.has_cycles(): + print 'Warning: database has cycles' + + sorted_relations = [] + graph = dict((x.full_name, x.dependencies) for x in self.relations()) + + # Run until the unsorted graph is empty + while graph: + acyclic = False + for relation_name, dependencies in graph.items(): + for dependency in dependencies: + if dependency in graph: + break + else: + acyclic = True + graph.pop(relation_name) + sorted_relations.append(self.relation(relation_name)) + + if not acyclic: + raise RuntimeError("A cyclic dependency occurred") + return sorted_relations diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index b07c8fb..5679742 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -4,8 +4,6 @@ from pyparsing import ParseException from pyparsing import ZeroOrMore -from ..sql import SqlStatement - from .utils import _all from .utils import _create from .utils import _db_name @@ -123,10 +121,6 @@ def 
parse_create_table(statement): Returns: table_data(dict): table_data dictionary for instantiating a table object """ - - if not isinstance(statement, SqlStatement): - raise ValueError('Input to table parser must of a SqlStatement object') - string = statement.sql() # Parse the base table definitions diff --git a/dataduct/database/parsers/create_view.py b/dataduct/database/parsers/create_view.py index 78879af..d191494 100644 --- a/dataduct/database/parsers/create_view.py +++ b/dataduct/database/parsers/create_view.py @@ -6,8 +6,6 @@ from pyparsing import Word from pyparsing import ZeroOrMore -from ..sql import SqlStatement - from .utils import _create from .utils import _view from .utils import _db_name @@ -29,10 +27,6 @@ def parse_create_view(statement): Returns: view_data(dict): view_data dictionary for instantiating a view object """ - - if not isinstance(statement, SqlStatement): - raise ValueError('Input to view parser must of a SqlStatement object') - string = statement.sql() end = ')' + StringEnd() diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py index 4223008..f8066b6 100644 --- a/dataduct/database/relation.py +++ b/dataduct/database/relation.py @@ -76,9 +76,9 @@ def create_script(self, grant_permissions=True): script.append(self.grant_script()) return script - def recreate_script(self): + def recreate_script(self, grant_permissions=True): """Sql script to recreate the view """ script = self.drop_script() - script.append(self.create_script()) + script.append(self.create_script(grant_permissions)) return script diff --git a/dataduct/database/table.py b/dataduct/database/table.py index f7fc7fb..02c4ff2 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -120,7 +120,7 @@ def temporary_clone_script(self): def drop_script(self): """Sql script to drop the table """ - return SqlScript('DROP TABLE %s CASCADE' % self.full_name) + return SqlScript('DROP TABLE IF EXISTS %s CASCADE' % self.full_name) def analyze_script(self): """Sql script to analyze the table diff --git a/dataduct/database/view.py b/dataduct/database/view.py index 3ae7ff5..bec0fc1 100644 --- a/dataduct/database/view.py +++ b/dataduct/database/view.py @@ -44,4 +44,4 @@ def columns(self): def drop_script(self): """Sql script to drop the view """ - return SqlScript('DROP VIEW %s CASCADE' % self.full_name) + return SqlScript('DROP VIEW IF EXISTS %s CASCADE' % self.full_name) From aefb2a9bb51e1e34cb8cb9b6954c703bffe0a602 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 12 Jan 2015 14:57:03 -0800 Subject: [PATCH 045/175] Database Actions --- dataduct/database/__init__.py | 2 ++ dataduct/database/database.py | 53 +++++++++++++++++++++++++++++++++++ dataduct/database/table.py | 15 ++++++++++ 3 files changed, 70 insertions(+) diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py index d72e57a..017c83f 100644 --- a/dataduct/database/__init__.py +++ b/dataduct/database/__init__.py @@ -2,3 +2,5 @@ from .sql import SqlScript from .sql import SqlStatement from .table import Table +from .view import View +from .database import Database diff --git a/dataduct/database/database.py b/dataduct/database/database.py index 867dac5..2e8cc84 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -4,6 +4,7 @@ from .relation import Relation from .view import View from .table import Table +from .sql import SqlScript class Database(object): @@ -104,3 +105,55 @@ def sorted_relations(self): if not acyclic: raise RuntimeError("A cyclic dependency 
occurred") return sorted_relations + + def relations_script(self, function_name, **kwargs): + """SQL Script for all the relations of the database + """ + result = SqlScript() + for relation in self.sorted_relations(): + func = getattr(relation, function_name) + result.append(func(**kwargs)) + return result + + def grant_relations_script(self): + """SQL Script for granting permissions all the relations of the database + """ + return self.relations_script('grant_script') + + def create_relations_script(self, grant_permissions=True): + """SQL Script for creating all the relations of the database + """ + return self.relations_script( + 'create_script', grant_permissions=grant_permissions) + + def recreate_relations_script(self, grant_permissions=True): + """SQL Script for recreating all the relations of the database + """ + return self.relations_script( + 'recreate_script', grant_permissions=grant_permissions) + + def recreate_table_dependencies(self, table_name): + """Recreate the dependencies for a particular table from the database + """ + result = SqlScript() + for relation in self.relations(): + if relation.full_name == table_name: + # Continue as cannnot be dependecy of self + continue + + if isinstance(relation, Table): + # Recreate foreign key relations + for column_names, ref_name, ref_columns in \ + relation.forign_key_references(): + if ref_name == table_name: + result.append( + relation.foreign_key_reference_script( + source_columns=column_names, + reference_name=ref_name, + reference_columns=ref_columns)) + + if isinstance(relation, View): + # Recreate view if pointing to table + if table_name in relation.dependencies: + result.append(relation.recreate_script()) + return result diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 02c4ff2..1efde19 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -126,3 +126,18 @@ def analyze_script(self): """Sql script to analyze the table """ return SqlScript('ANALYZE %s' % self.full_name) + + def foreign_key_reference_script(self, source_columns, reference_name, + reference_columns): + """Sql Script to create a FK reference from table x to y + """ + sql = """ + ALTER TABLE {source_name} + ADD FOREIGN KEY ({source_columns}) + REFERENCES {reference_name} ({reference_columns}) + """.format(source_name=self.full_name, + source_columns=', '.join(source_columns), + reference_name=reference_name, + reference_columns=', '.join(reference_columns)) + + return SqlScript(sql) From bd62485e067feb2452af752152eb62f3f63c91c1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 13 Jan 2015 16:35:19 -0800 Subject: [PATCH 046/175] Table manipulation scripts --- dataduct/database/__init__.py | 4 +- dataduct/database/column.py | 4 +- dataduct/database/parsers/__init__.py | 1 + dataduct/database/parsers/select_query.py | 26 ++- .../database/{sql => }/select_statement.py | 12 +- dataduct/database/sql/__init__.py | 1 - dataduct/database/sql/sql_statement.py | 21 +++ dataduct/database/table.py | 151 +++++++++++++++++- dataduct/database/view.py | 2 +- 9 files changed, 208 insertions(+), 14 deletions(-) rename dataduct/database/{sql => }/select_statement.py (65%) diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py index 017c83f..7710f0e 100644 --- a/dataduct/database/__init__.py +++ b/dataduct/database/__init__.py @@ -1,6 +1,6 @@ -from .sql import SelectStatement +from .database import Database +from .select_statement import SelectStatement from .sql import SqlScript from .sql import SqlStatement 
from .table import Table from .view import View -from .database import Database diff --git a/dataduct/database/column.py b/dataduct/database/column.py index fc6bb56..948c099 100644 --- a/dataduct/database/column.py +++ b/dataduct/database/column.py @@ -33,7 +33,9 @@ def __init__(self, column_name, column_type, encoding=None, def __str__(self): """String output for the columns """ - return ' '.join(self.column_name, self.column_type) + if self.column_type is not None: + return '%s %s' % (self.column_name, self.column_type) + return self.column_name @property def primary(self): diff --git a/dataduct/database/parsers/__init__.py b/dataduct/database/parsers/__init__.py index aa967b7..19119bb 100644 --- a/dataduct/database/parsers/__init__.py +++ b/dataduct/database/parsers/__init__.py @@ -6,6 +6,7 @@ from .select_query import parse_select_dependencies from .select_query import parse_select_columns +from .select_query import parse_column_name from .create_table import parse_create_table from .create_view import parse_create_view diff --git a/dataduct/database/parsers/select_query.py b/dataduct/database/parsers/select_query.py index ba14012..192a588 100644 --- a/dataduct/database/parsers/select_query.py +++ b/dataduct/database/parsers/select_query.py @@ -1,8 +1,10 @@ """Select SQL parser """ -from pyparsing import restOfLine -from pyparsing import MatchFirst from pyparsing import delimitedList +from pyparsing import MatchFirst +from pyparsing import printables +from pyparsing import restOfLine +from pyparsing import Word from pyparsing import WordStart from .utils import _db_name @@ -80,3 +82,23 @@ def parse_select_columns(statement): # Strip extra whitespace from the string return [column.strip() for column in output] + + +def parse_column_name(string): + """Parse column name from select query + + Note: + This assumes that every column has a name and is the last word of str + + Args: + string(str): Input string to be parsed + + Returns: + result(str): column name + """ + # Find all words in the string + words = Word(printables.replace('\n\r', '')).searchString(string) + + # Get the last word matched + name = words.pop().asList().pop() + return name diff --git a/dataduct/database/sql/select_statement.py b/dataduct/database/select_statement.py similarity index 65% rename from dataduct/database/sql/select_statement.py rename to dataduct/database/select_statement.py index c11e539..32ee3d0 100644 --- a/dataduct/database/sql/select_statement.py +++ b/dataduct/database/select_statement.py @@ -1,9 +1,11 @@ """Script containing the SelectStatement object """ -from .sql_statement import SqlStatement -from ..parsers import parse_select_dependencies -from ..parsers import parse_select_columns +from .sql import SqlStatement +from .column import Column +from .parsers import parse_select_dependencies +from .parsers import parse_select_columns +from .parsers import parse_column_name class SelectStatement(SqlStatement): @@ -15,7 +17,9 @@ def __init__(self, sql): super(SelectStatement, self).__init__(sql) self._dependencies = parse_select_dependencies(self.sql()) - self._columns = parse_select_columns(self.sql()) + self._raw_columns = parse_select_columns(self.sql()) + self._columns = [ + Column(parse_column_name(c), None) for c in self._raw_columns] @property def dependencies(self): diff --git a/dataduct/database/sql/__init__.py b/dataduct/database/sql/__init__.py index 24a8cdf..ef98ecc 100644 --- a/dataduct/database/sql/__init__.py +++ b/dataduct/database/sql/__init__.py @@ -1,3 +1,2 @@ from .sql_statement 
import SqlStatement from .sql_script import SqlScript -from .select_statement import SelectStatement diff --git a/dataduct/database/sql/sql_statement.py b/dataduct/database/sql/sql_statement.py index 61bdcb4..208ed02 100644 --- a/dataduct/database/sql/sql_statement.py +++ b/dataduct/database/sql/sql_statement.py @@ -2,6 +2,8 @@ """ from copy import deepcopy from .utils import sanatize_sql +from ..parsers import parse_create_table +from ..parsers import parse_create_view class SqlStatement(object): @@ -45,3 +47,22 @@ def _sanatize_sql(self): return raw_statements[0] else: return '' + + def _validate_parser(self, func): + """Check if a parser satisfies the sql statement + """ + try: + func(self.sql()) + except Exception: + return False + return True + + def creates_table(self): + """SQL statement creates a table. + """ + return self._validate_parser(parse_create_table) + + def creates_view(self): + """SQL statement creates a view. + """ + return self._validate_parser(parse_create_view) diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 1efde19..f391347 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -2,10 +2,17 @@ """ from .parsers import parse_create_table from .sql import SqlScript +from .select_statement import SelectStatement from .column import Column from .relation import Relation +def comma_seperated(elements): + """Create a comma separated string from the iterator + """ + return ','.join(elements) + + class Table(Relation): """Class representing tables in the database """ @@ -76,6 +83,12 @@ def primary_keys(self): """ return [c for c in self.columns if c.primary] + @property + def primary_key_names(self): + """Primary keys of the table + """ + return [c.name for c in self.columns if c.primary] + def forign_key_references(self): """Get a list of all foreign key references from the table """ @@ -109,7 +122,7 @@ def temporary_clone_script(self): table_name = self.table_name + '_temp' # Create a list of column definitions - columns = ', '.join( + columns = comma_seperated( ['%s %s' %(c.column_name, c.column_type) for c in self.columns]) # We don't need any constraints to be specified on the temp table @@ -127,6 +140,17 @@ def analyze_script(self): """ return SqlScript('ANALYZE %s' % self.full_name) + def rename_script(self, new_name): + """Sql script to rename the table + """ + return SqlScript( + 'ALTER TABLE %s RENAME TO %s' %(self.full_name, new_name)) + + def delete_script(self, where_condition=''): + """Sql script to delete from table based on where condition + """ + return SqlScript('DELETE FROM %s %s' %(self.full_name, where_condition)) + def foreign_key_reference_script(self, source_columns, reference_name, reference_columns): """Sql Script to create a FK reference from table x to y @@ -136,8 +160,129 @@ def foreign_key_reference_script(self, source_columns, reference_name, ADD FOREIGN KEY ({source_columns}) REFERENCES {reference_name} ({reference_columns}) """.format(source_name=self.full_name, - source_columns=', '.join(source_columns), + source_columns=comma_seperated(source_columns), reference_name=reference_name, - reference_columns=', '.join(reference_columns)) + reference_columns=comma_seperated(reference_columns)) + + return SqlScript(sql) + + def select_duplicates_script(self): + """Sql Script to select duplicate primary keys from the table + """ + pk_columns = comma_seperated(self.primary_key_names) + sql = """ + SELECT {pk_columns} + ,COUNT(1) duplicate_count + FROM {table_name} + GROUP BY {pk_columns} + HAVING 
COUNT(1) > 1 + """.format(table_name=self.full_name, + pk_columns=pk_columns) + + return SqlScript(sql) + + def _source_sql(self, source_relation): + """Get the source sql based on the type of the source specified + """ + if not (isinstance(source_relation, Relation) or \ + isinstance(source_relation, SelectStatement)): + raise ValueError('Source Relation must be a relation or select') + + if len(self.columns) < len(source_relation.columns): + raise ValueError('Source has more columns than destination') + if isinstance(source_relation, SelectStatement): + source_sql = '(' + source_relation.sql() + ')' + else: + source_sql = source_relation.full_name + + return source_sql + + def insert_script(self, source_relation): + """Sql Script to insert into the table while avoiding PK violations + """ + sql = 'INSERT INTO %s (SELECT * FROM %s)' %( + self.full_name, self._source_sql(source_relation)) return SqlScript(sql) + + def delete_matching_rows_script(self, source_relation): + """Sql Script to delete matching rows between table and source + """ + if len(self.primary_keys) == 0: + raise RuntimeError( + 'Cannot delete matching rows from table with no primary keys') + + source_col_names, pk_names = [], [] + source_columns = source_relation.columns + for i, column in enumerate(self.columns): + if column.primary: + pk_names.append(column.name) + source_col_names.append(source_columns[i].name) + + where_condition = 'WHERE (%s) IN (SELECT DISTINCT %s FROM %s)' % ( + comma_seperated(pk_names), comma_seperated(source_col_names), + self._source_sql(source_relation)) + + return self.delete_script(where_condition) + + def de_duplication_script(self): + """De-duplicate the table to enforce primary keys + """ + if len(self.primary_keys) == 0: + raise RuntimeError( + 'Cannot de-duplicate table with no primary keys') + + script = self.temporary_clone_script() + column_names = [c.name for c in self.columns] + + # Create a temporary clone from the script + temp_table = self.__class__(script) + script.append(temp_table.insert_script(self)) + script.append(self.delete_script) + + # Pick a random value on multiple primary keys + sql = """ + INSERT INTO {table_name} ( + SELECT {column_names} + FROM ( + SELECT *, + COUNT(1) OVER ( + PARTITION BY {pk_names} + ORDER BY 1 ROWS UNBOUNDED PRECEDING) rnk + FROM {temp_table}) + WHERE rnk = 1) + """.format(table_name=self.full_name, + column_names=comma_seperated(column_names), + pk_names=self.primary_key_names, + temp_table=temp_table.full_name) + + script.append(SqlScript(sql)) + return script + + def upsert_script(self, source_relation, enforce_primary_key=True, + delete_existing=False): + """Sql script to upsert into a table + + The script first copies all the source data into a temporary table. + Then if the enforce_primary_key flag is set we de-duplicate the temp + table. After which if the delete existing flag is set we delete all + the data from the destination table otherwise only the rows that match + the temporary table. After which we copy the temporary table into the + destination table. 
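+
+        A rough usage sketch (illustrative only): assuming `orders` and
+        `orders_staging` are Table objects built from their CREATE TABLE
+        scripts, e.g. Table(SqlScript(filename='tables/orders.sql')),
+
+            script = orders.upsert_script(orders_staging,
+                                          enforce_primary_key=True,
+                                          delete_existing=False)
+            print script
+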
+ """ + script = self.temporary_clone_script() + + # Create a temporary clone from the script + temp_table = self.__class__(script) + script.append(temp_table.insert_script(source_relation)) + if enforce_primary_key: + script.append(temp_table.de_duplication_script()) + + if delete_existing: + script.append(self.delete_script()) + else: + script.append(self.delete_matching_rows_script(temp_table)) + + script.append(self.insert_script(temp_table)) + script.append(temp_table.drop_script()) + return script diff --git a/dataduct/database/view.py b/dataduct/database/view.py index bec0fc1..94680f9 100644 --- a/dataduct/database/view.py +++ b/dataduct/database/view.py @@ -2,7 +2,7 @@ """ from .parsers import parse_create_view from .sql import SqlScript -from .sql import SelectStatement +from .select_statement import SelectStatement from .relation import Relation From 5d62030f1ec52de8693852fc505d5fed2164a8b6 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 13 Jan 2015 17:25:16 -0800 Subject: [PATCH 047/175] create database from files --- dataduct/database/database.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/dataduct/database/database.py b/dataduct/database/database.py index 2e8cc84..6a0b36a 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -1,21 +1,31 @@ """Script containing the database class object """ from copy import deepcopy + from .relation import Relation from .view import View from .table import Table from .sql import SqlScript +from ..utils.helpers import atmost_one +from ..utils.helpers import parse_path + class Database(object): """Class representing a database """ - def __init__(self, relations=None): + def __init__(self, relations=None, files=None): """Constructor for the database class """ self._relations = {} + if not atmost_one(relations, files): + raise ValueError('Only one of relations and files should be given') + + if files: + relations = self._initialize_relations(files) + if relations: for relation in relations: self.add_relation(relation) @@ -25,6 +35,22 @@ def copy(self): """ return deepcopy(self) + @staticmethod + def _initialize_relations(files): + """Read the files and create relations from the files + """ + relations = [] + for filename in files: + with open(parse_path(filename)) as f: + script = SqlScript(f.read()) + if script.creates_table(): + relations.append(Table(script)) + elif script.creates_view(): + relations.append(View(script)) + else: + raise ValueError('File does not create a relation') + return relations + def add_relation(self, relation): """Add a relation, only if its name is not already used. 
""" From 0abe7b093f1dd2e1959c1f8e5a53af0a3419cbfc Mon Sep 17 00:00:00 2001 From: sb2nov Date: Wed, 14 Jan 2015 01:59:48 -0800 Subject: [PATCH 048/175] Share constants across config --- dataduct/config/config.py | 23 ++++++++++++++--------- dataduct/config/config_actions.py | 10 +++++----- dataduct/config/constants.py | 6 ++++++ 3 files changed, 25 insertions(+), 14 deletions(-) create mode 100644 dataduct/config/constants.py diff --git a/dataduct/config/config.py b/dataduct/config/config.py index dafc336..7edce25 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -1,8 +1,13 @@ """Module that maintains the config singleton object used across the package """ -import os +from os.path import expanduser +from os.path import join +from os import environ import yaml +from .constants import DATADUCT_CFG_FILE +from .constants import DATADUCT_DIR + def get_config_files(): """Get the config file for dataduct @@ -11,17 +16,17 @@ def get_config_files(): The order of precedence is: 1. /etc/dataduct.cfg 2. ~/.dataduct - 3. DATADUCT_PATH environment variable + 3. DATADUCT_CONFIG_PATH environment variable """ - dataduct_config_path = '/etc/dataduct.cfg' - dataduct_user_config_path = os.path.join(os.path.expanduser('~'), - '.dataduct') + dataduct_config_path = join('/etc', DATADUCT_CFG_FILE) + dataduct_user_config_path = join(expanduser('~'), DATADUCT_DIR, + DATADUCT_CFG_FILE) config_files = [dataduct_config_path, dataduct_user_config_path] - # Check DATADUCT_PATH env variable for other configuration locations - if 'DATADUCT_PATH' in os.environ: - for path in os.environ['DATADUCT_PATH'].split(":"): - config_files.append(os.path.expanduser(path)) + # Check DATADUCT_CONFIG_PATH env variable for other configuration locations + if 'DATADUCT_CONFIG_PATH' in environ: + for path in environ['DATADUCT_CONFIG_PATH'].split(":"): + config_files.append(expanduser(path)) return config_files diff --git a/dataduct/config/config_actions.py b/dataduct/config/config_actions.py index b3df162..9b9540d 100644 --- a/dataduct/config/config_actions.py +++ b/dataduct/config/config_actions.py @@ -2,19 +2,19 @@ Script that has action functions for config """ from .config import Config - from ..s3 import S3Path from ..s3 import S3File -config = Config() -CONFIG_STR = 'config' -DATADUCT_FILE_NAME = 'dataduct.cfg' +from .constants import CONFIG_STR +from .constants import DATADUCT_CFG_FILE +config = Config() + def s3_config_path(): """S3 uri for the config files """ - key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, DATADUCT_FILE_NAME] + key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, DATADUCT_CFG_FILE] return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) diff --git a/dataduct/config/constants.py b/dataduct/config/constants.py new file mode 100644 index 0000000..25efd49 --- /dev/null +++ b/dataduct/config/constants.py @@ -0,0 +1,6 @@ +"""Constants shared across the config package +""" + +CONFIG_STR = 'config' +DATADUCT_CFG_FILE = 'dataduct.cfg' +DATADUCT_DIR = '.dataduct' From 662bac88f76c83c414960b7c602ee44f356c6a2b Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 15 Jan 2015 14:32:11 -0800 Subject: [PATCH 049/175] Added travis dependency tests --- .travis.yml | 1 + dataduct/tests/__init__.py | 1 + dataduct/tests/test_import.py | 70 +++++++++++++++++++++++++++++++++++ requirements.txt | 6 +++ 4 files changed, 78 insertions(+) create mode 100644 dataduct/tests/__init__.py create mode 100644 dataduct/tests/test_import.py diff --git a/.travis.yml b/.travis.yml index 19a9de5..779bcb9 100644 --- 
a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,7 @@ language: python # command to install dependencies install: + - sudo apt-get install graphviz - pip install -r requirements.txt # command to run tests diff --git a/dataduct/tests/__init__.py b/dataduct/tests/__init__.py new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/dataduct/tests/__init__.py @@ -0,0 +1 @@ + diff --git a/dataduct/tests/test_import.py b/dataduct/tests/test_import.py new file mode 100644 index 0000000..4146bb4 --- /dev/null +++ b/dataduct/tests/test_import.py @@ -0,0 +1,70 @@ +"""Tests for dependencies +""" +from unittest import TestCase + +class TestImports(TestCase): + """Tests for dependencies + """ + @staticmethod + def test_boto(): + """Testing boto + """ + print 'Trying to import boto' + import boto + + @staticmethod + def test_mysqldb(): + """Testing MySQLdb + """ + print 'Trying to import MySQLdb' + import MySQLdb + + @staticmethod + def test_pandas(): + """Testing pandas + """ + print 'Trying to import pandas' + import pandas + print pandas.io.sql + + @staticmethod + def test_psycopg2(): + """Testing psycopg2 + """ + print 'Trying to import psycopg2' + import psycopg2 + + @staticmethod + def test_pygraphviz(): + """Testing pygraphviz + """ + print 'Trying to import pygraphviz' + import pygraphviz + + @staticmethod + def test_pyparsing(): + """Testing pyparsing + """ + print 'Trying to import pyparsing' + import pyparsing + + @staticmethod + def test_pyyaml(): + """Testing PyYAML + """ + print 'Trying to import pyyaml' + import yaml + + @staticmethod + def test_setuptools(): + """Testing setuptools + """ + print 'Trying to import setuptools' + import setuptools + + @staticmethod + def test_sphinx_rtd_theme(): + """Testing sphinx_rtd_theme + """ + print 'Trying to import sphinx_rtd_theme' + import sphinx_rtd_theme diff --git a/requirements.txt b/requirements.txt index 163703d..edaa036 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,9 @@ Sphinx>=1.2.3 sphinx-rtd-theme>=0.1.6 sphinxcontrib-napoleon>=0.2.8 pandas>=0.14.1 +psycopg2 +MySQL-python +PyYAML +coverage +pyparsing>=2 +pygraphviz From bf2949dbc6cdf0accd292870d15ea33c00caaf36 Mon Sep 17 00:00:00 2001 From: sb2nov Date: Thu, 15 Jan 2015 15:31:51 -0800 Subject: [PATCH 050/175] Logging configuration across the package --- dataduct/config/config.py | 10 ++++---- dataduct/config/config_actions.py | 4 ++-- dataduct/config/constants.py | 5 ++-- dataduct/config/logger_config.py | 40 +++++++++++++++++++++++++++++++ setup.py | 2 +- 5 files changed, 51 insertions(+), 10 deletions(-) create mode 100644 dataduct/config/logger_config.py diff --git a/dataduct/config/config.py b/dataduct/config/config.py index 7edce25..824ac8c 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -5,8 +5,8 @@ from os import environ import yaml -from .constants import DATADUCT_CFG_FILE -from .constants import DATADUCT_DIR +from .constants import CFG_FILE +from .constants import CONFIG_DIR def get_config_files(): @@ -18,9 +18,9 @@ def get_config_files(): 2. ~/.dataduct 3. 
DATADUCT_CONFIG_PATH environment variable """ - dataduct_config_path = join('/etc', DATADUCT_CFG_FILE) - dataduct_user_config_path = join(expanduser('~'), DATADUCT_DIR, - DATADUCT_CFG_FILE) + dataduct_config_path = join('/etc', CFG_FILE) + dataduct_user_config_path = join(expanduser('~'), CONFIG_DIR, + CFG_FILE) config_files = [dataduct_config_path, dataduct_user_config_path] # Check DATADUCT_CONFIG_PATH env variable for other configuration locations diff --git a/dataduct/config/config_actions.py b/dataduct/config/config_actions.py index 9b9540d..709c236 100644 --- a/dataduct/config/config_actions.py +++ b/dataduct/config/config_actions.py @@ -6,7 +6,7 @@ from ..s3 import S3File from .constants import CONFIG_STR -from .constants import DATADUCT_CFG_FILE +from .constants import CFG_FILE config = Config() @@ -14,7 +14,7 @@ def s3_config_path(): """S3 uri for the config files """ - key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, DATADUCT_CFG_FILE] + key = [config.etl.get('S3_BASE_PATH', ''), CONFIG_STR, CFG_FILE] return S3Path(bucket=config.etl['S3_ETL_BUCKET'], key=key) diff --git a/dataduct/config/constants.py b/dataduct/config/constants.py index 25efd49..7e44208 100644 --- a/dataduct/config/constants.py +++ b/dataduct/config/constants.py @@ -2,5 +2,6 @@ """ CONFIG_STR = 'config' -DATADUCT_CFG_FILE = 'dataduct.cfg' -DATADUCT_DIR = '.dataduct' +CONFIG_DIR = '.dataduct' +CFG_FILE = 'dataduct.cfg' +LOG_FILE = 'dataduct.log' diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py new file mode 100644 index 0000000..b221ef5 --- /dev/null +++ b/dataduct/config/logger_config.py @@ -0,0 +1,40 @@ +""" +Script that has the base logger configurations +""" +import os +import logging +from logging.handlers import RotatingFileHandler +from logging import StreamHandler + +from .config import Config +from .constants import CONFIG_DIR +from .constants import LOG_FILE + +DATE_FMT = '%m-%d %H:%M' +LOG_FMT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' + +config = Config() + +def logger_configuration(): + """Set the logger configurations for dataduct + """ + if not os.path.exists(CONFIG_DIR): + os.makedir(CONFIG_DIR) + + log_directory = os.path.join(os.path.expanduser(CONFIG_DIR)) + file_name = LOG_FILE + if hasattr(config, 'logging') and 'LOG_DIR' in config.logging: + log_directory = config.logging.get('LOG_DIR') + file_name = config.logging.get('LOG_FILE') + + logging.basicConfig(level=logging.DEBUG, + format=LOG_FMT, + datefmt=DATE_FMT) + + file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), + maxBytes=200000, backupCount=10) + console_handler = StreamHandler() + console_handler.setLevel(logging.WARNING) + + logging.getLogger('').addHandler(file_handler) + logging.getLogger('').addHandler(console_handler) diff --git a/setup.py b/setup.py index 2d6fa2c..896c1c9 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ license='Apache License 2.0', description='DataPipeline for Humans', install_requires=[ - 'boto>=2.32', + 'boto>=2.34', 'PyYAML', 'pandas', 'psycopg2', From 36fd9b7c1364f7e218c65719a50178530a975d01 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 15 Jan 2015 16:18:43 -0800 Subject: [PATCH 051/175] Fragmented dataduct into multiple subparsers --- bin/dataduct | 117 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 84 insertions(+), 33 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index ccc0f77..69c60d8 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -11,9 +11,13 @@ from dataduct.config import Config 
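# A hypothetical set of invocations for the reorganized command line
# interface (the .yaml and .png file names below are placeholders):
#
#     dataduct config sync_to_s3
#     dataduct pipeline activate my_pipeline.yaml
#     dataduct visualize my_graph.png my_pipeline.yaml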
CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' -VISUALIZE_STR = 'visualize' -SYNC_CONFIG_TO_S3 = 'sync_config_to_s3' -SYNC_CONFIG_FROM_S3 = 'sync_config_from_s3' + +CONFIG_TO_S3 = 'sync_to_s3' +CONFIG_FROM_S3 = 'sync_from_s3' + +CONFIG_COMMAND = 'config' +PIPELINE_COMMAND = 'pipeline' +VISUALIZE_COMMAND = 'visualize' def config_actions(action, filename): @@ -22,84 +26,129 @@ def config_actions(action, filename): from dataduct.config.config_actions import sync_to_s3 from dataduct.config.config_actions import sync_from_s3 - if action == SYNC_CONFIG_TO_S3: + if action == CONFIG_TO_S3: return sync_to_s3() - if action == SYNC_CONFIG_FROM_S3: + if action == CONFIG_FROM_S3: return sync_from_s3(filename) -def pipeline_actions(action, load_definitions, force_overwrite, filename, - delay): +def pipeline_actions(action, load_definitions, force_overwrite, delay): """Pipeline related actions are executed in this block """ from dataduct.etl import activate_pipeline from dataduct.etl import create_pipeline from dataduct.etl import read_pipeline_definition from dataduct.etl import validate_pipeline - from dataduct.etl import visualize_pipeline for load_definition in load_definitions: definition = read_pipeline_definition(load_definition) definition.update({'delay': delay}) etl = create_pipeline(definition) - if action in [VISUALIZE_STR]: - visualize_pipeline(etl, filename) if action in [VALIDATE_STR, ACTIVATE_STR]: validate_pipeline(etl, force_overwrite) if action == ACTIVATE_STR: activate_pipeline(etl) +def visualize_actions(load_definitions, filename): + """Visualization actions are executed in this block + """ + from dataduct.etl import create_pipeline + from dataduct.etl import read_pipeline_definition + from dataduct.etl import visualize_pipeline + + for load_definition in load_definitions: + definition = read_pipeline_definition(load_definition) + + etl = create_pipeline(definition) + visualize_pipeline(etl, filename) + + def main(): """Main function""" parser = argparse.ArgumentParser(description='Run Dataduct commands') + parser.add_argument( - '-a', - '--action', + '-m', + '--mode', + default=None, + help='Mode to run the pipeline and config overrides to use', + ) + + subparsers = parser.add_subparsers(help='Commands', dest='command') + + # Config parser declaration + config_parser = subparsers.add_parser(CONFIG_COMMAND) + + config_parser.add_argument( + 'action', + type=str, + choices={ + CONFIG_TO_S3: 'sync config file from local to s3', + CONFIG_FROM_S3: 'sync config file from s3 to local file', + }, + default=CONFIG_FROM_S3, + ) + + config_parser.add_argument( + '-f', + '--filename', + default=None, + help='Filename to sync', + ) + + # Pipeline parser declaration + pipeline_parser = subparsers.add_parser(PIPELINE_COMMAND) + + pipeline_parser.add_argument( + 'action', type=str, choices={ CREATE_STR: 'Create a pipeline locally', VALIDATE_STR: 'Validate a pipeline with AWS without activating', ACTIVATE_STR: 'create a pipeline and activate it on AWS', - VISUALIZE_STR: 'visualize a pipeline', - SYNC_CONFIG_TO_S3: 'sync config file from local to s3', - SYNC_CONFIG_FROM_S3: 'sync config file from s3 to local file', }, default=CREATE_STR, ) - parser.add_argument( + + pipeline_parser.add_argument( 'load_definitions', nargs='*', help='Enter the paths of the load definitions', ) - parser.add_argument( + + pipeline_parser.add_argument( '-f', '--force_overwrite', action='store_true', default=False, help='Indicates that if this pipeline exists, it will be destroyed', ) - 
parser.add_argument( - '-m', - '--mode', - default=None, - help='Mode to run the pipeline and config overrides to use', - ) - parser.add_argument( + + pipeline_parser.add_argument( '-d', '--delay', default=0, type=int, help='Delay the pipeline by x days', ) - parser.add_argument( - '-F', - '--filename', - default=None, - help='Filename for various actions', + + # Visualize parser declaration + visualize_parser = subparsers.add_parser(VISUALIZE_COMMAND) + + visualize_parser.add_argument( + 'filename', + help='Filename for the graph', + ) + + visualize_parser.add_argument( + 'load_definitions', + nargs='*', + help='Enter the paths of the load definitions', ) + args = parser.parse_args() mode = args.mode @@ -112,13 +161,15 @@ def main(): # As this is the single entry point to the library # We can use the __new__ function to set the debug_level config = Config(mode=mode) - print 'Running the pipeline in %s mode.' %config.mode + print 'Running the pipeline in %s mode.' % config.mode - if args.action in [SYNC_CONFIG_TO_S3, SYNC_CONFIG_FROM_S3]: + if args.command == CONFIG_COMMAND: config_actions(args.action, args.filename) - else: + elif args.command == PIPELINE_COMMAND: pipeline_actions(args.action, args.load_definitions, - args.force_overwrite, args.filename, args.delay) + args.force_overwrite, args.delay) + else: + visualize_actions(args.load_definitions, args.filename) if __name__ == '__main__': From f0444ad5179bc09b655a8d44b7c0e98ea4907e3c Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 15 Jan 2015 17:42:31 -0800 Subject: [PATCH 052/175] Code review changes --- bin/dataduct | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index 69c60d8..3c0776a 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -28,24 +28,31 @@ def config_actions(action, filename): if action == CONFIG_TO_S3: return sync_to_s3() + return sync_from_s3(filename) - if action == CONFIG_FROM_S3: - return sync_from_s3(filename) - -def pipeline_actions(action, load_definitions, force_overwrite, delay): - """Pipeline related actions are executed in this block +def initialize_etl_objects(load_definitions, delay=None): + """Generate etl objects from yaml files """ - from dataduct.etl import activate_pipeline from dataduct.etl import create_pipeline from dataduct.etl import read_pipeline_definition - from dataduct.etl import validate_pipeline + etls = [] for load_definition in load_definitions: definition = read_pipeline_definition(load_definition) - definition.update({'delay': delay}) + if delay is not None: + definition.update({'delay': delay}) + etls.append(create_pipeline(definition)) + return etls + + +def pipeline_actions(action, load_definitions, force_overwrite, delay): + """Pipeline related actions are executed in this block + """ + from dataduct.etl import activate_pipeline + from dataduct.etl import validate_pipeline - etl = create_pipeline(definition) + for etl in initialize_etl_objects(load_definitions, delay): if action in [VALIDATE_STR, ACTIVATE_STR]: validate_pipeline(etl, force_overwrite) if action == ACTIVATE_STR: @@ -55,33 +62,25 @@ def pipeline_actions(action, load_definitions, force_overwrite, delay): def visualize_actions(load_definitions, filename): """Visualization actions are executed in this block """ - from dataduct.etl import create_pipeline - from dataduct.etl import read_pipeline_definition from dataduct.etl import visualize_pipeline - for load_definition in load_definitions: - definition = 
read_pipeline_definition(load_definition) - - etl = create_pipeline(definition) + for etl in initialize_etl_objects(load_definitions): visualize_pipeline(etl, filename) def main(): """Main function""" parser = argparse.ArgumentParser(description='Run Dataduct commands') - parser.add_argument( '-m', '--mode', default=None, help='Mode to run the pipeline and config overrides to use', ) - subparsers = parser.add_subparsers(help='Commands', dest='command') # Config parser declaration config_parser = subparsers.add_parser(CONFIG_COMMAND) - config_parser.add_argument( 'action', type=str, @@ -91,7 +90,6 @@ def main(): }, default=CONFIG_FROM_S3, ) - config_parser.add_argument( '-f', '--filename', @@ -101,7 +99,6 @@ def main(): # Pipeline parser declaration pipeline_parser = subparsers.add_parser(PIPELINE_COMMAND) - pipeline_parser.add_argument( 'action', type=str, @@ -112,13 +109,11 @@ def main(): }, default=CREATE_STR, ) - pipeline_parser.add_argument( 'load_definitions', - nargs='*', + nargs='+', help='Enter the paths of the load definitions', ) - pipeline_parser.add_argument( '-f', '--force_overwrite', @@ -126,7 +121,6 @@ def main(): default=False, help='Indicates that if this pipeline exists, it will be destroyed', ) - pipeline_parser.add_argument( '-d', '--delay', @@ -137,15 +131,13 @@ def main(): # Visualize parser declaration visualize_parser = subparsers.add_parser(VISUALIZE_COMMAND) - visualize_parser.add_argument( 'filename', help='Filename for the graph', ) - visualize_parser.add_argument( 'load_definitions', - nargs='*', + nargs='+', help='Enter the paths of the load definitions', ) From 518c99260c2d9606c13038023af2bac9453010d3 Mon Sep 17 00:00:00 2001 From: sb2nov Date: Thu, 15 Jan 2015 17:50:55 -0800 Subject: [PATCH 053/175] Multiple handlers --- bin/dataduct | 4 +++ dataduct/config/__init__.py | 1 + dataduct/config/logger_config.py | 46 ++++++++++++++++++-------------- dataduct/etl/etl_actions.py | 5 +++- 4 files changed, 35 insertions(+), 21 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index ccc0f77..8520a17 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -7,6 +7,7 @@ Script that helps create and validate pipelines from command line import argparse from dataduct.config import Config +from dataduct.config import logger_configuration CREATE_STR = 'create' VALIDATE_STR = 'validate' @@ -114,6 +115,9 @@ def main(): config = Config(mode=mode) print 'Running the pipeline in %s mode.' 
%config.mode + # Setup up logging for package + logger_configuration() + if args.action in [SYNC_CONFIG_TO_S3, SYNC_CONFIG_FROM_S3]: config_actions(args.action, args.filename) else: diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index cca5d9b..dd24350 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1 +1,2 @@ from .config import Config +from .logger_config import logger_configuration diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index b221ef5..04a5a4d 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -4,37 +4,43 @@ import os import logging from logging.handlers import RotatingFileHandler -from logging import StreamHandler from .config import Config from .constants import CONFIG_DIR from .constants import LOG_FILE -DATE_FMT = '%m-%d %H:%M' -LOG_FMT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' +FILE_FORMAT_STR = '%(asctime)s [%(levelname)s]: %(message)s ' + \ + '[in %(module)s:%(lineno)d in %(funcName)s]' +CONSOLE_FORMAT_STR = '[%(levelname)s]: %(message)s' -config = Config() def logger_configuration(): """Set the logger configurations for dataduct """ - if not os.path.exists(CONFIG_DIR): - os.makedir(CONFIG_DIR) + config = Config() - log_directory = os.path.join(os.path.expanduser(CONFIG_DIR)) - file_name = LOG_FILE - if hasattr(config, 'logging') and 'LOG_DIR' in config.logging: - log_directory = config.logging.get('LOG_DIR') - file_name = config.logging.get('LOG_FILE') + if hasattr(config, 'logging'): + log_directory = config.logging.get( + 'LOG_DIR', os.path.join(os.path.expanduser(CONFIG_DIR))) + file_name = config.logging.get('LOG_FILE', LOG_FILE) + console_level = config.logging.get('DEBUG_LEVEL', logging.WARNING) - logging.basicConfig(level=logging.DEBUG, - format=LOG_FMT, - datefmt=DATE_FMT) + if not os.path.exists(log_directory): + os.makedir(log_directory) - file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), - maxBytes=200000, backupCount=10) - console_handler = StreamHandler() - console_handler.setLevel(logging.WARNING) + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) - logging.getLogger('').addHandler(file_handler) - logging.getLogger('').addHandler(console_handler) + file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), + maxBytes=200000, + backupCount=10) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(logging.Formatter(FILE_FORMAT_STR, + datefmt='%Y-%m-%d %H:%M')) + + console_handler = logging.StreamHandler() + console_handler.setLevel(console_level) + console_handler.setFormatter(logging.Formatter(CONSOLE_FORMAT_STR)) + + logger.addHandler(console_handler) + logger.addHandler(file_handler) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 211fd09..db1dcb4 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -12,6 +12,9 @@ URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa +import logging +logger = logging.getLogger(__name__) + def read_pipeline_definition(file_path): """Function reads the yaml pipeline definitions. 
@@ -99,8 +102,8 @@ def visualize_pipeline(etl, filename=None): if filename is None: raise ETLInputError('Filename must be provided for visualization') + logger.info('Creating a visualization of %s', etl.name) graph = pygraphviz.AGraph(name=etl.name, directed=True, label=etl.name) - pipeline_objects = etl.pipeline_objects() # Add nodes for all activities From 5dad409f72ee72833286e09d4b403b7fdd1a0857 Mon Sep 17 00:00:00 2001 From: sb2nov Date: Thu, 15 Jan 2015 18:23:20 -0800 Subject: [PATCH 054/175] Set correct level for the FileHandler --- bin/dataduct | 3 +-- dataduct/config/logger_config.py | 7 +++---- dataduct/etl/etl_actions.py | 3 +-- 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index c6b1a45..7eeb16d 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -1,7 +1,6 @@ #!/usr/bin/env python -""" -Script that helps create and validate pipelines from command line +"""Script that helps create and validate pipelines from command line """ import argparse diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index 04a5a4d..19d0785 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -1,5 +1,4 @@ -""" -Script that has the base logger configurations +"""Script that has the base logger configurations """ import os import logging @@ -10,7 +9,7 @@ from .constants import LOG_FILE FILE_FORMAT_STR = '%(asctime)s [%(levelname)s]: %(message)s ' + \ - '[in %(module)s:%(lineno)d in %(funcName)s]' + '[in %(name)s:%(lineno)d in %(funcName)s]' CONSOLE_FORMAT_STR = '[%(levelname)s]: %(message)s' @@ -34,7 +33,7 @@ def logger_configuration(): file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), maxBytes=200000, backupCount=10) - file_handler.setLevel(logging.DEBUG) + file_handler.setLevel(logging.INFO) file_handler.setFormatter(logging.Formatter(FILE_FORMAT_STR, datefmt='%Y-%m-%d %H:%M')) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index db1dcb4..a860a99 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -1,5 +1,4 @@ -""" -Script that parses the pipeline definition and has action functions +"""Script that parses the pipeline definition and has action functions """ import yaml From 2ab85b39775aaa70e194d54c910395c476ca412b Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 15 Jan 2015 18:43:19 -0800 Subject: [PATCH 055/175] Add 'activites-only' filter for etl visualization --- bin/dataduct | 12 +++++++--- dataduct/etl/etl_actions.py | 44 +++++++++++++++++++------------------ 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index 3c0776a..8957d3c 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -59,13 +59,13 @@ def pipeline_actions(action, load_definitions, force_overwrite, delay): activate_pipeline(etl) -def visualize_actions(load_definitions, filename): +def visualize_actions(load_definitions, activities_only, filename): """Visualization actions are executed in this block """ from dataduct.etl import visualize_pipeline for etl in initialize_etl_objects(load_definitions): - visualize_pipeline(etl, filename) + visualize_pipeline(etl, activities_only, filename) def main(): @@ -141,6 +141,11 @@ def main(): help='Enter the paths of the load definitions', ) + visualize_parser.add_argument( + '--activities-only', + action='store_true', + ) + args = parser.parse_args() mode = args.mode @@ -161,7 +166,8 @@ def main(): pipeline_actions(args.action, args.load_definitions, args.force_overwrite, args.delay) 
else: - visualize_actions(args.load_definitions, args.filename) + visualize_actions(args.load_definitions, args.activities_only, + args.filename) if __name__ == '__main__': diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 211fd09..27414a2 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -83,7 +83,7 @@ def activate_pipeline(etl): URL_TEMPLATE.format(ID=etl.pipeline.id) -def visualize_pipeline(etl, filename=None): +def visualize_pipeline(etl, activities_only, filename=None): """Visualize the pipeline that was created Args: @@ -108,27 +108,29 @@ def visualize_pipeline(etl, filename=None): if isinstance(p_object, Activity): graph.add_node(p_object.id, shape='diamond', color='turquoise', style='filled') - if isinstance(p_object, MysqlNode): - graph.add_node(p_object.id, shape='egg', color='beige', - style='filled') - if isinstance(p_object, RedshiftNode): - graph.add_node(p_object.id, shape='egg', color='goldenrod', - style='filled') - if isinstance(p_object, S3Node): - graph.add_node(p_object.id, shape='folder', color='grey', - style='filled') + if not activities_only: + if isinstance(p_object, MysqlNode): + graph.add_node(p_object.id, shape='egg', color='beige', + style='filled') + if isinstance(p_object, RedshiftNode): + graph.add_node(p_object.id, shape='egg', color='goldenrod', + style='filled') + if isinstance(p_object, S3Node): + graph.add_node(p_object.id, shape='folder', color='grey', + style='filled') # Add data dependencies - for p_object in pipeline_objects: - if isinstance(p_object, Activity): - if p_object.input: - if isinstance(p_object.input, list): - for ip in p_object.input: - graph.add_edge(ip.id, p_object.id) - else: - graph.add_edge(p_object.input.id, p_object.id) - if p_object.output: - graph.add_edge(p_object.id, p_object.output.id) + if not activities_only: + for p_object in pipeline_objects: + if isinstance(p_object, Activity): + if p_object.input: + if isinstance(p_object.input, list): + for ip in p_object.input: + graph.add_edge(ip.id, p_object.id) + else: + graph.add_edge(p_object.input.id, p_object.id) + if p_object.output: + graph.add_edge(p_object.id, p_object.output.id) # Add depends_on dependencies for p_object in pipeline_objects: @@ -143,7 +145,7 @@ def visualize_pipeline(etl, filename=None): for dependency in dependencies: graph.add_edge(dependency.id, p_object.id, color='blue') - if isinstance(p_object, S3Node): + if not activities_only and isinstance(p_object, S3Node): for dependency in p_object.dependency_nodes: graph.add_edge(dependency.id, p_object.id, color='grey') From ea24f47090bd42248dc4ead34e614e9dd0f9e457 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 15 Jan 2015 18:46:39 -0800 Subject: [PATCH 056/175] Added help message for 'activities-only' filter --- bin/dataduct | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/dataduct b/bin/dataduct index 8957d3c..eb75196 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -144,6 +144,7 @@ def main(): visualize_parser.add_argument( '--activities-only', action='store_true', + help='Visualize only activities', ) args = parser.parse_args() From b2f9659fc92eb26073c13a994c128bc791aae337 Mon Sep 17 00:00:00 2001 From: sb2nov Date: Fri, 16 Jan 2015 12:00:57 -0800 Subject: [PATCH 057/175] parser fix and basic test --- dataduct/database/parsers/create_table.py | 8 +++--- dataduct/database/parsers/create_view.py | 11 +++++--- .../parsers/tests/test_create_table.py | 27 +++++++++++++++++++ .../parsers/tests/test_create_view.py | 26 ++++++++++++++++++ 
dataduct/database/sql/sql_script.py | 10 +++++++ dataduct/database/table.py | 2 +- dataduct/database/view.py | 2 +- dataduct/steps/qa_transform.py | 6 +++-- 8 files changed, 80 insertions(+), 12 deletions(-) create mode 100644 dataduct/database/parsers/tests/test_create_view.py diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index 5679742..cbfc6dd 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -109,20 +109,18 @@ def get_attributes_parser(): distkey_def = _distkey + paranthesis_list('distkey') sortkey_def = _sortkey + paranthesis_list('sortkey') - return OneOrMore(diststyle_def | sortkey_def | distkey_def) + return ZeroOrMore(diststyle_def | sortkey_def | distkey_def) -def parse_create_table(statement): +def parse_create_table(string): """Parse the create table sql query and return metadata Args: - statement(SqlStatement): Input sql statement that should be parsed + string(sql): SQL string from a SQL Statement Returns: table_data(dict): table_data dictionary for instantiating a table object """ - string = statement.sql() - # Parse the base table definitions table_data = to_dict(get_base_parser().parseString(string)) diff --git a/dataduct/database/parsers/create_view.py b/dataduct/database/parsers/create_view.py index d191494..10dd63a 100644 --- a/dataduct/database/parsers/create_view.py +++ b/dataduct/database/parsers/create_view.py @@ -18,16 +18,21 @@ merge = lambda x: ' '.join(x[0]) -def parse_create_view(statement): +def rreplace(s, old, new): + li = s.rsplit(old, 1) + return new.join(li) + +def parse_create_view(string): """Parse the create view sql query and return metadata Args: - statement(SqlStatement): Input sql statement that should be parsed + string(str): Input sql string that should be parsed Returns: view_data(dict): view_data dictionary for instantiating a view object """ - string = statement.sql() + + string = rreplace(string, ')', ' )') end = ')' + StringEnd() select = Group(ZeroOrMore(~end + Word(printables))) diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index 8b23c92..660f0a7 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -1,3 +1,30 @@ """Tests for create table parser """ +from unittest import TestCase +from nose.tools import eq_ +from ..create_table import parse_create_table + + +class TestCreateTableStatement(TestCase): + """Tests for create table + """ + @staticmethod + def test_basic(): + """Basic test for create table + """ + query = 'CREATE TABLE orders (' +\ + 'customer_id INTEGER DISTKEY PRIMARY KEY,' +\ + 'customer_name VARCHAR(200))' + + full_name = 'orders' + temporary = False + exists_checks = False + + output = parse_create_table(query) + + eq_(output['full_name'], full_name) + eq_(output['temporary'], temporary) + eq_(output['exists_checks'], exists_checks) + eq_(len(output['constraints']), 0) + eq_(len(output['columns']), 2) diff --git a/dataduct/database/parsers/tests/test_create_view.py b/dataduct/database/parsers/tests/test_create_view.py new file mode 100644 index 0000000..cf519b4 --- /dev/null +++ b/dataduct/database/parsers/tests/test_create_view.py @@ -0,0 +1,26 @@ +"""Tests for create view parser +""" + +from unittest import TestCase +from nose.tools import eq_ +from ..create_view import parse_create_view + + +class TestCreateViewStatement(TestCase): + """Tests for create view + """ + 
@staticmethod + def test_basic(): + """Basic test for create view + """ + query = 'CREATE VIEW orders AS (' + \ + 'SELECT x, y, z from xyz_table)' + + full_name = 'orders' + replace = False + + output = parse_create_view(query) + + eq_(output['view_name'], full_name) + eq_(output['replace'], replace) + eq_(output['select_statement'], 'SELECT x, y, z from xyz_table') diff --git a/dataduct/database/sql/sql_script.py b/dataduct/database/sql/sql_script.py index a8cc254..8d96fd0 100644 --- a/dataduct/database/sql/sql_script.py +++ b/dataduct/database/sql/sql_script.py @@ -106,3 +106,13 @@ def wrap_transaction(self): [BeginStatement()] + self.statements + [CommitStatement()]) return new_script + + def creates_table(self): + """SQL script creates a table. + """ + return self.statements[0].creates_table() + + def creates_view(self): + """SQL script creates a view. + """ + return self.statements[0].creates_view() diff --git a/dataduct/database/table.py b/dataduct/database/table.py index f391347..9feb2ce 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -25,7 +25,7 @@ def __init__(self, sql): # Take the first statement and ignore the rest sql = SqlScript.statements[0] - parameters = parse_create_table(sql) + parameters = parse_create_table(sql.sql()) self.sql_statement = sql self.parameters = parameters diff --git a/dataduct/database/view.py b/dataduct/database/view.py index 94680f9..d9c6a76 100644 --- a/dataduct/database/view.py +++ b/dataduct/database/view.py @@ -17,7 +17,7 @@ def __init__(self, sql): # Take the first statement and ignore the rest sql = SqlScript.statements[0] - parameters = parse_create_view(sql) + parameters = parse_create_view(sql.sql()) self.sql_statement = sql self.parameters = parameters diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index bbd82a8..d7b1946 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -5,7 +5,6 @@ from ..config import Config config = Config() -SNS_TOPIC_ARN_WARNING = config.etl['SNS_TOPIC_ARN_WARNING'] class QATransformStep(TransformStep): @@ -16,7 +15,7 @@ def __init__(self, id, pipeline_name, script_arguments=None, - sns_topic_arn=SNS_TOPIC_ARN_WARNING, + sns_topic_arn=None, **kwargs): """Constructor for the QATransformStep class @@ -26,6 +25,9 @@ def __init__(self, **kwargs(optional): Keyword arguments directly passed to base class """ + if sns_topic_arn is None: + sns_topic_arn = config.etl['SNS_TOPIC_ARN_WARNING'] + if script_arguments is None: script_arguments = list() From 4bc3f5274fb2acc08c0330768ade24c955dc6c07 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Fri, 16 Jan 2015 17:23:26 -0800 Subject: [PATCH 058/175] Add database actions and visualization --- bin/dataduct | 89 ++++++++++++++++++++++++++--- dataduct/config/logger_config.py | 2 +- dataduct/database/database.py | 64 +++++++++++++++++++++ dataduct/database/sql/sql_script.py | 5 +- dataduct/database/table.py | 5 +- dataduct/utils/exceptions.py | 6 ++ examples/tables/categories.sql | 5 ++ examples/tables/customers.sql | 9 +++ examples/tables/employees.sql | 7 +++ examples/tables/order_details.sql | 6 ++ examples/tables/orders.sql | 7 +++ examples/tables/products.sql | 8 +++ examples/tables/shippers.sql | 5 ++ examples/tables/suppliers.sql | 10 ++++ 14 files changed, 216 insertions(+), 12 deletions(-) create mode 100644 examples/tables/categories.sql create mode 100644 examples/tables/customers.sql create mode 100644 examples/tables/employees.sql create mode 100644 examples/tables/order_details.sql 
create mode 100644 examples/tables/orders.sql create mode 100644 examples/tables/products.sql create mode 100644 examples/tables/shippers.sql create mode 100644 examples/tables/suppliers.sql diff --git a/bin/dataduct b/bin/dataduct index 2336aa3..7a8169e 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -11,11 +11,15 @@ from dataduct.config import logger_configuration CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' +DROP_STR = 'drop' +GRANT_STR = 'grant' +RECREATE_STR = 'recreate' CONFIG_TO_S3 = 'sync_to_s3' CONFIG_FROM_S3 = 'sync_from_s3' CONFIG_COMMAND = 'config' +DATABASE_COMMAND = 'database' PIPELINE_COMMAND = 'pipeline' VISUALIZE_COMMAND = 'visualize' @@ -59,15 +63,43 @@ def pipeline_actions(action, load_definitions, force_overwrite, delay): activate_pipeline(etl) -def visualize_actions(load_definitions, activities_only, filename): - """Visualization actions are executed in this block +def database_actions(action, table_definitions): + """Database related actions are executed in this block """ + from dataduct.database import Database + + database = Database(files=table_definitions) + if action == CREATE_STR: + script = database.create_relations_script() + elif action == DROP_STR: + script = database.drop_relations_script() + elif action == GRANT_STR: + script = database.grant_relations_script() + elif action == RECREATE_STR: + script = database.recreate_relations_script() + print script + + +def visualize_pipeline_actions(load_definitions, activities_only, filename): + """Visualization actions for pipelines are executed in this block + """ + from dataduct.etl import visualize_pipeline for etl in initialize_etl_objects(load_definitions): visualize_pipeline(etl, activities_only, filename) +def visualize_database_actions(table_definitions, filename): + """Visualization actions for databases are executed in this block + """ + + from dataduct.database import Database + + database = Database(files=table_definitions) + database.visualize(filename) + + def main(): """Main function""" parser = argparse.ArgumentParser(description='Run Dataduct commands') @@ -129,24 +161,59 @@ def main(): help='Delay the pipeline by x days', ) + # Database parser declaration + database_parser = subparsers.add_parser(DATABASE_COMMAND) + database_parser.add_argument( + 'action', + type=str, + choices={ + CREATE_STR: 'Create tables', + DROP_STR: 'Drop views and tables', + GRANT_STR: 'Grant permissions to neccessary groups', + RECREATE_STR: 'Recreate tables, load new data, drop old tables', + }, + ) + database_parser.add_argument( + 'table_definitions', + nargs='+', + help='Enter the paths of the load definitions', + ) + # Visualize parser declaration visualize_parser = subparsers.add_parser(VISUALIZE_COMMAND) - visualize_parser.add_argument( + visualize_subparsers = visualize_parser.add_subparsers( + help='Commands', dest='visualize_command') + + # Visualize pipeline parser declaration + visualize_pipeline_parser = \ + visualize_subparsers.add_parser(PIPELINE_COMMAND) + visualize_pipeline_parser.add_argument( 'filename', help='Filename for the graph', ) - visualize_parser.add_argument( + visualize_pipeline_parser.add_argument( 'load_definitions', nargs='+', help='Enter the paths of the load definitions', ) - - visualize_parser.add_argument( + visualize_pipeline_parser.add_argument( '--activities-only', action='store_true', help='Visualize only activities', ) + visualize_database_parser = \ + visualize_subparsers.add_parser(DATABASE_COMMAND) + visualize_database_parser.add_argument( + 
'filename', + help='Filename for the graph', + ) + visualize_database_parser.add_argument( + 'table_definitions', + nargs='+', + help='Enter the paths of the table definitions', + ) + args = parser.parse_args() mode = args.mode @@ -169,9 +236,15 @@ def main(): elif args.command == PIPELINE_COMMAND: pipeline_actions(args.action, args.load_definitions, args.force_overwrite, args.delay) + elif args.command == DATABASE_COMMAND: + database_actions(args.action, args.table_definitions) else: - visualize_actions(args.load_definitions, args.activities_only, - args.filename) + if args.visualize_command == PIPELINE_COMMAND: + visualize_pipeline_actions( + args.load_definitions, args.activities_only, args.filename) + else: + visualize_database_actions( + args.table_definitions, args.filename) if __name__ == '__main__': diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index 19d0785..79cd596 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -25,7 +25,7 @@ def logger_configuration(): console_level = config.logging.get('DEBUG_LEVEL', logging.WARNING) if not os.path.exists(log_directory): - os.makedir(log_directory) + os.mkdir(log_directory) logger = logging.getLogger() logger.setLevel(logging.DEBUG) diff --git a/dataduct/database/database.py b/dataduct/database/database.py index 6a0b36a..585a9d0 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -10,6 +10,10 @@ from ..utils.helpers import atmost_one from ..utils.helpers import parse_path +from ..utils.exceptions import DatabaseInputError + +import logging +logger = logging.getLogger(__name__) class Database(object): """Class representing a database @@ -61,6 +65,7 @@ def add_relation(self, relation): self._relations[relation.full_name] = relation + @property def relations(self): """Unsorted list of relations of the database """ @@ -152,6 +157,11 @@ def create_relations_script(self, grant_permissions=True): return self.relations_script( 'create_script', grant_permissions=grant_permissions) + def drop_relations_script(self): + """SQL Script for dropping all the relations for the database + """ + return self.relations_script('drop_script') + def recreate_relations_script(self, grant_permissions=True): """SQL Script for recreating all the relations of the database """ @@ -183,3 +193,57 @@ def recreate_table_dependencies(self, table_name): if table_name in relation.dependencies: result.append(relation.recreate_script()) return result + + def _make_node_label(self, relation): + """ + Output: a pydot format of this table. + """ + html_lines = ['<'] + html_lines += [''] + for col in sorted(relation.columns, key=lambda x: x.position): + col_name = col.name + (' PK' if col.primary else '') + html_lines += [''] + html_lines += ['
<TR><TD>' + relation.full_name + '</TD></TR>']
+            html_lines += ['<TR><TD>' + col_name + '</TD></TR>']
+        html_lines += ['</TABLE>
>'] + + return '\n'.join(html_lines) + + def visualize(self, filename=None): + """Visualize databases + """ + # Import pygraphviz for plotting the graphs + try: + import pygraphviz + except ImportError: + raise ImportError('Install pygraphviz for visualizing databases') + + if filename is None: + raise DatabaseInputError( + 'Filename must be provided for visualization') + + logger.info('Creating a visualization of the database') + graph = pygraphviz.AGraph( + name='database', label='database') + + # Add nodes + for relation in self.relations: + if isinstance(relation, Table): + graph.add_node(relation.full_name) + node = graph.get_node(relation.full_name) + node.attr['label'] = self._make_node_label(relation) + node.attr['shape'] = 'none' + + # Add edges + for relation in self.relations: + if isinstance(relation, Table): + for cols, ref_table_name, ref_col_names in \ + relation.foreign_key_references: + # ref_name = ref_table_name + \ + # ':' + ref_col_names + graph.add_edge(relation.full_name, ref_table_name) + # graph.add_edge(t.full_name + ":" + cols[0], ref_name) + + # Plotting the graph with dot layout + graph.layout(prog='dot') + graph.draw(filename) diff --git a/dataduct/database/sql/sql_script.py b/dataduct/database/sql/sql_script.py index 8d96fd0..79b5742 100644 --- a/dataduct/database/sql/sql_script.py +++ b/dataduct/database/sql/sql_script.py @@ -16,7 +16,7 @@ class SqlScript(object): def __init__(self, sql=None, statements=None, filename=None): """Constructor for the SqlScript class """ - assert atmost_one(sql, statements, filename), 'Multiple intializer' + assert atmost_one(sql, statements, filename), 'Multiple initializer' if sql is None: sql = '' @@ -77,6 +77,9 @@ def copy(self): def append(self, elements): """Append the elements to the SQL script """ + if elements is None: + return self.copy() + if isinstance(elements, SqlStatement): self.add_statement(elements) return self.copy() diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 9feb2ce..b6dec09 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -23,7 +23,7 @@ def __init__(self, sql): if isinstance(sql, SqlScript): # Take the first statement and ignore the rest - sql = SqlScript.statements[0] + sql = sql.statements[0] parameters = parse_create_table(sql.sql()) @@ -89,7 +89,8 @@ def primary_key_names(self): """ return [c.name for c in self.columns if c.primary] - def forign_key_references(self): + @property + def foreign_key_references(self): """Get a list of all foreign key references from the table """ result = list() diff --git a/dataduct/utils/exceptions.py b/dataduct/utils/exceptions.py index 178517a..5ae1dd0 100644 --- a/dataduct/utils/exceptions.py +++ b/dataduct/utils/exceptions.py @@ -5,3 +5,9 @@ class ETLInputError(Exception): pass class ETLConfigError(Exception): pass + +""" +Exceptions for database +""" + +class DatabaseInputError(Exception): pass diff --git a/examples/tables/categories.sql b/examples/tables/categories.sql new file mode 100644 index 0000000..54848f0 --- /dev/null +++ b/examples/tables/categories.sql @@ -0,0 +1,5 @@ +CREATE TABLE categories ( + category_id INTEGER DISTKEY PRIMARY KEY + ,category_name VARCHAR(100) + ,description VARCHAR(2000) +) SORTKEY(category_id); diff --git a/examples/tables/customers.sql b/examples/tables/customers.sql new file mode 100644 index 0000000..bd55bb0 --- /dev/null +++ b/examples/tables/customers.sql @@ -0,0 +1,9 @@ +CREATE TABLE customers ( + customer_id INTEGER DISTKEY PRIMARY KEY + ,customer_name VARCHAR(200) + 
,contact_name VARCHAR(200) + ,address VARCHAR(200) + ,city VARCHAR(100) + ,postal_code VARCHAR(10) + ,country VARCHAR(100) +) SORTKEY(customer_id); diff --git a/examples/tables/employees.sql b/examples/tables/employees.sql new file mode 100644 index 0000000..fbbcf9a --- /dev/null +++ b/examples/tables/employees.sql @@ -0,0 +1,7 @@ +CREATE TABLE employees ( + employee_id INTEGER DISTKEY PRIMARY KEY + ,last_name VARCHAR(100) + ,first_name VARCHAR(100) + ,birth_date DATE + ,notes VARCHAR(2000) +) SORTKEY(employee_id); diff --git a/examples/tables/order_details.sql b/examples/tables/order_details.sql new file mode 100644 index 0000000..e0f2f75 --- /dev/null +++ b/examples/tables/order_details.sql @@ -0,0 +1,6 @@ +CREATE TABLE order_details ( + order_detail_id INTEGER DISTKEY PRIMARY KEY + ,order_id INTEGER REFERENCES orders(order_id) + ,product_id INTEGER REFERENCES products(product_id) + ,quantity INTEGER +) SORTKEY(order_detail_id); diff --git a/examples/tables/orders.sql b/examples/tables/orders.sql new file mode 100644 index 0000000..c332965 --- /dev/null +++ b/examples/tables/orders.sql @@ -0,0 +1,7 @@ +CREATE TABLE orders ( + order_id INTEGER DISTKEY PRIMARY KEY + ,customer_id INTEGER REFERENCES customers(customer_id) + ,employee_id INTEGER REFERENCES employees(employee_id) + ,order_date DATE + ,shipper_id INTEGER REFERENCES shippers(shipper_id) +) SORTKEY(order_id); diff --git a/examples/tables/products.sql b/examples/tables/products.sql new file mode 100644 index 0000000..356198d --- /dev/null +++ b/examples/tables/products.sql @@ -0,0 +1,8 @@ +CREATE TABLE products ( + product_id INTEGER DISTKEY PRIMARY KEY + ,product_name VARCHAR(200) + ,supplier_id INTEGER REFERENCES suppliers(supplier_id) + ,category_id INTEGER REFERENCES categories(category_id) + ,unit VARCHAR(200) + ,price REAL +) SORTKEY(product_id); diff --git a/examples/tables/shippers.sql b/examples/tables/shippers.sql new file mode 100644 index 0000000..bed1454 --- /dev/null +++ b/examples/tables/shippers.sql @@ -0,0 +1,5 @@ +CREATE TABLE shippers ( + shipper_id INTEGER DISTKEY PRIMARY KEY + ,shipper_name VARCHAR(200) + ,phone VARCHAR(20) +) SORTKEY(shipper_id); diff --git a/examples/tables/suppliers.sql b/examples/tables/suppliers.sql new file mode 100644 index 0000000..d70e7f1 --- /dev/null +++ b/examples/tables/suppliers.sql @@ -0,0 +1,10 @@ +CREATE TABLE suppliers ( + supplier_id INTEGER DISTKEY PRIMARY KEY + ,supplier_name VARCHAR(200) + ,contact_name VARCHAR(200) + ,address VARCHAR(200) + ,city VARCHAR(100) + ,postal_code VARCHAR(10) + ,county VARCHAR(100) + ,phone VARCHAR(20) +) SORTKEY(supplier_id); From 56222d1a339d13c607231e083c45c020fb5367fd Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Fri, 16 Jan 2015 22:34:34 -0800 Subject: [PATCH 059/175] Code review changes --- bin/dataduct | 2 +- dataduct/database/database.py | 10 ++++------ dataduct/database/table.py | 1 - dataduct/utils/exceptions.py | 6 ++---- 4 files changed, 7 insertions(+), 12 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index 7a8169e..2221c8a 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -176,7 +176,7 @@ def main(): database_parser.add_argument( 'table_definitions', nargs='+', - help='Enter the paths of the load definitions', + help='Enter the paths of the table definitions', ) # Visualize parser declaration diff --git a/dataduct/database/database.py b/dataduct/database/database.py index 585a9d0..d608a36 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -65,7 +65,6 @@ def add_relation(self, relation): 
self._relations[relation.full_name] = relation - @property def relations(self): """Unsorted list of relations of the database """ @@ -195,8 +194,7 @@ def recreate_table_dependencies(self, table_name): return result def _make_node_label(self, relation): - """ - Output: a pydot format of this table. + """Create the html table layout for graph nodes """ html_lines = ['<'] html_lines += ['' + for column in sorted(relation.columns, key=lambda x: x.position): + columns.append(row.format(col_name=column.name, + pk=' (PK)' if column.primary else '')) - return '\n'.join(html_lines) + layout = ('<
' + relation.full_name + @@ -227,7 +225,7 @@ def visualize(self, filename=None): name='database', label='database') # Add nodes - for relation in self.relations: + for relation in self.relations(): if isinstance(relation, Table): graph.add_node(relation.full_name) node = graph.get_node(relation.full_name) @@ -235,10 +233,10 @@ def visualize(self, filename=None): node.attr['shape'] = 'none' # Add edges - for relation in self.relations: + for relation in self.relations(): if isinstance(relation, Table): for cols, ref_table_name, ref_col_names in \ - relation.foreign_key_references: + relation.foreign_key_references(): # ref_name = ref_table_name + \ # ':' + ref_col_names graph.add_edge(relation.full_name, ref_table_name) diff --git a/dataduct/database/table.py b/dataduct/database/table.py index b6dec09..a12ebe2 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -89,7 +89,6 @@ def primary_key_names(self): """ return [c.name for c in self.columns if c.primary] - @property def foreign_key_references(self): """Get a list of all foreign key references from the table """ diff --git a/dataduct/utils/exceptions.py b/dataduct/utils/exceptions.py index 5ae1dd0..dcd917a 100644 --- a/dataduct/utils/exceptions.py +++ b/dataduct/utils/exceptions.py @@ -1,13 +1,11 @@ -""" -Exceptions for etl_lib +"""Exceptions for etl_lib """ class ETLInputError(Exception): pass class ETLConfigError(Exception): pass -""" -Exceptions for database +"""Exceptions for database """ class DatabaseInputError(Exception): pass From 26c88c236bee17f5e6f04db4b3fc7b11341b6c21 Mon Sep 17 00:00:00 2001 From: sb2nov Date: Sat, 17 Jan 2015 01:07:44 -0800 Subject: [PATCH 060/175] Table references from columns --- dataduct/database/database.py | 52 +++++++++++++++++------------------ dataduct/database/table.py | 2 +- dataduct/utils/exceptions.py | 5 +--- 3 files changed, 27 insertions(+), 32 deletions(-) diff --git a/dataduct/database/database.py b/dataduct/database/database.py index d608a36..6c9f1a4 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -179,7 +179,7 @@ def recreate_table_dependencies(self, table_name): if isinstance(relation, Table): # Recreate foreign key relations for column_names, ref_name, ref_columns in \ - relation.forign_key_references(): + relation.foreign_key_references(): if ref_name == table_name: result.append( relation.foreign_key_reference_script( @@ -193,19 +193,21 @@ def recreate_table_dependencies(self, table_name): result.append(relation.recreate_script()) return result - def _make_node_label(self, relation): - """Create the html table layout for graph nodes + @staticmethod + def _make_node_label(relation): + """Create the table layout for graph nodes """ - html_lines = ['<'] - html_lines += [''] - for col in sorted(relation.columns, key=lambda x: x.position): - col_name = col.name + (' PK' if col.primary else '') - html_lines += [''] - html_lines += ['
' + relation.full_name + - ' ' + - col_name + ' >'] - return '\n'.join(html_lines) + columns = list() + row = '{col_name}{pk}' + for column in sorted(relation.columns, key=lambda x: x.position): + columns.append(row.format(col_name=column.name, + pk=' (PK)' if column.primary else '')) + layout = ('<\n' + '{table_name}\n' + '{columns}
>').format(table_name=relation.full_name, + columns='\n'.join(columns)) + return layout def visualize(self, filename=None): """Visualize databases @@ -222,25 +224,21 @@ def visualize(self, filename=None): logger.info('Creating a visualization of the database') graph = pygraphviz.AGraph( - name='database', label='database') + name='Database', label='Database') + + tables = [r for r in self.relations() if isinstance(r, Table)] # Add nodes - for relation in self.relations(): - if isinstance(relation, Table): - graph.add_node(relation.full_name) - node = graph.get_node(relation.full_name) - node.attr['label'] = self._make_node_label(relation) - node.attr['shape'] = 'none' + for table in tables: + graph.add_node(table.full_name, shape='none', + label=self._make_node_label(table)) # Add edges - for relation in self.relations(): - if isinstance(relation, Table): - for cols, ref_table_name, ref_col_names in \ - relation.foreign_key_references(): - # ref_name = ref_table_name + \ - # ':' + ref_col_names - graph.add_edge(relation.full_name, ref_table_name) - # graph.add_edge(t.full_name + ":" + cols[0], ref_name) + for table in tables: + for cols, ref_table, ref_cols in table.foreign_key_references(): + graph.add_edge(ref_table, table.full_name, tailport=ref_cols[0], + headport=cols[0], dir='both', arrowhead='crow', + arrowtail='dot') # Plotting the graph with dot layout graph.layout(prog='dot') diff --git a/dataduct/database/table.py b/dataduct/database/table.py index a12ebe2..ed7bcfe 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -96,7 +96,7 @@ def foreign_key_references(self): for column in self.columns: if column.fk_table is not None: result.append(( - [column.name], column.fk_table, column.fk_reference)) + [column.name], column.fk_table, [column.fk_reference])) for constraint in self._constraints: if 'fk_table' in constraint: diff --git a/dataduct/utils/exceptions.py b/dataduct/utils/exceptions.py index dcd917a..1bb122c 100644 --- a/dataduct/utils/exceptions.py +++ b/dataduct/utils/exceptions.py @@ -1,11 +1,8 @@ -"""Exceptions for etl_lib +"""Exceptions for dataduct """ class ETLInputError(Exception): pass class ETLConfigError(Exception): pass -"""Exceptions for database -""" - class DatabaseInputError(Exception): pass From 7276870523cd8a4d4fa2907398ecbc1458e5359b Mon Sep 17 00:00:00 2001 From: sb2nov Date: Sat, 17 Jan 2015 01:11:41 -0800 Subject: [PATCH 061/175] nit picks --- dataduct/database/database.py | 11 +++++++---- dataduct/etl/etl_actions.py | 3 ++- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dataduct/database/database.py b/dataduct/database/database.py index 6c9f1a4..ba6ae97 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -210,21 +210,24 @@ def _make_node_label(relation): return layout def visualize(self, filename=None): - """Visualize databases + """Visualize databases and create an er-diagram + + Args: + filename(str): filepath for saving the er-diagram """ # Import pygraphviz for plotting the graphs try: import pygraphviz except ImportError: - raise ImportError('Install pygraphviz for visualizing databases') + logger.error('Install pygraphviz for visualizing databases') + raise if filename is None: raise DatabaseInputError( 'Filename must be provided for visualization') logger.info('Creating a visualization of the database') - graph = pygraphviz.AGraph( - name='Database', label='Database') + graph = pygraphviz.AGraph(name='Database', label='Database') tables = [r for r in self.relations() if 
isinstance(r, Table)] diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 8ff96c4..23acc1e 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -96,7 +96,8 @@ def visualize_pipeline(etl, activities_only, filename=None): try: import pygraphviz except ImportError: - raise ImportError('Install pygraphviz for visualizing pipelines') + logger.error('Install pygraphviz for visualizing pipelines') + raise if filename is None: raise ETLInputError('Filename must be provided for visualization') From bc8fffbcfdaba2d9930ad017c65e53365f488793 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Mon, 19 Jan 2015 18:44:16 -0800 Subject: [PATCH 062/175] Add many unit tests, fixed some bugs --- bin/dataduct | 1 + dataduct/database/database.py | 6 +- .../parsers/tests/test_create_table.py | 29 ++ dataduct/database/select_statement.py | 4 +- .../database/sql/tests/test_sql_script.py | 108 +++++-- dataduct/database/table.py | 3 +- dataduct/database/tests/test_database.py | 270 ++++++++++++++++++ dataduct/database/view.py | 3 +- dataduct/etl/tests/test_definition_parser.py | 15 +- dataduct/tests/test_import.py | 7 + requirements.txt | 1 + 11 files changed, 416 insertions(+), 31 deletions(-) create mode 100644 dataduct/database/tests/test_database.py diff --git a/bin/dataduct b/bin/dataduct index 2221c8a..b5ab723 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -202,6 +202,7 @@ def main(): help='Visualize only activities', ) + # Visualize database parser declaration visualize_database_parser = \ visualize_subparsers.add_parser(DATABASE_COMMAND) visualize_database_parser.add_argument( diff --git a/dataduct/database/database.py b/dataduct/database/database.py index ba6ae97..aa64d78 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -105,9 +105,9 @@ def has_cycles(self, relation=None, visited=None): if relation.full_name in visited: return True # Make a copy for immutability - visited = deepcopy(visited) - visited.append(relation.full_name) - if self.has_cycles(relation, visited): + visited_copy = deepcopy(visited) + visited_copy.append(relation.full_name) + if self.has_cycles(relation, visited_copy): return True return False diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index 660f0a7..6a346cd 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -3,6 +3,9 @@ from unittest import TestCase from nose.tools import eq_ + +from pyparsing import ParseException + from ..create_table import parse_create_table @@ -28,3 +31,29 @@ def test_basic(): eq_(output['exists_checks'], exists_checks) eq_(len(output['constraints']), 0) eq_(len(output['columns']), 2) + + @staticmethod + def test_bad_input(): + """Feeding malformed input into create table + """ + query = 'CREATE TABLE orders (' +\ + 'customer_id INTEGER DISTKEY PRIMARY KEY' + + try: + parse_create_table(query) + assert False + except ParseException: + pass + + @staticmethod + def test_bad_input_in_columns(): + """Feeding malformed input into create table + """ + query = 'CREATE TABLE orders (' +\ + 'customer_id NEGATIVE DISTKEY PRIMARY KEY)' + + try: + parse_create_table(query) + assert False + except ParseException: + pass diff --git a/dataduct/database/select_statement.py b/dataduct/database/select_statement.py index 32ee3d0..8c9e37e 100644 --- a/dataduct/database/select_statement.py +++ b/dataduct/database/select_statement.py @@ -16,8 +16,8 @@ 
def __init__(self, sql): """ super(SelectStatement, self).__init__(sql) - self._dependencies = parse_select_dependencies(self.sql()) - self._raw_columns = parse_select_columns(self.sql()) + self._dependencies = parse_select_dependencies(self) + self._raw_columns = parse_select_columns(self) self._columns = [ Column(parse_column_name(c), None) for c in self._raw_columns] diff --git a/dataduct/database/sql/tests/test_sql_script.py b/dataduct/database/sql/tests/test_sql_script.py index 44e9d37..8b1eeef 100644 --- a/dataduct/database/sql/tests/test_sql_script.py +++ b/dataduct/database/sql/tests/test_sql_script.py @@ -15,8 +15,8 @@ class TestSqlScript(TestCase): def test_basic(): """Basic test for Script declaration """ - query = 'select \n 1;' - result = 'select 1;' + query = 'SELECT \n 1;' + result = 'SELECT 1;' eq_(SqlScript(query).sql(), result) @@ -24,8 +24,8 @@ def test_basic(): def test_sanatization(): """Sanatization of comments """ - query = 'select 1 -- test connect \n;' - result = 'select 1;' + query = 'SELECT 1 -- test connect \n;' + result = 'SELECT 1;' eq_(SqlScript(query).sql(), result) @@ -33,8 +33,8 @@ def test_sanatization(): def test_multiple_queries(): """Raise error if multiple queries are passed """ - query = 'select 1; select 2;' - result = 'select 1;\nselect 2;' + query = 'SELECT 1; SELECT 2;' + result = 'SELECT 1;\nSELECT 2;' eq_(SqlScript(query).sql(), result) @staticmethod @@ -47,7 +47,7 @@ def test_empty_declaration(): def test_length(): """Length of sql script """ - query = 'select 1; select 2;' + query = 'SELECT 1; SELECT 2;' result = 2 eq_(len(SqlScript(query)), result) @@ -56,31 +56,31 @@ def test_append_statement(): """Appending a statement to sql script """ script = SqlScript() - script.append(SqlStatement('Select 1')) - eq_(script.sql(), 'Select 1;') + script.append(SqlStatement('SELECT 1')) + eq_(script.sql(), 'SELECT 1;') @staticmethod def test_append_script(): """Appending a script to sql script """ - script = SqlScript('Select 1;') - script_new = SqlScript('Select 2;') + script = SqlScript('SELECT 1;') + script_new = SqlScript('SELECT 2;') script.append(script_new) - eq_(script.sql(), 'Select 1;\nSelect 2;') + eq_(script.sql(), 'SELECT 1;\nSELECT 2;') @staticmethod def test_append_string(): """Appending a string to sql script """ - script = SqlScript('Select 1;') - script.append('Select 2;') - eq_(script.sql(), 'Select 1;\nSelect 2;') + script = SqlScript('SELECT 1;') + script.append('SELECT 2;') + eq_(script.sql(), 'SELECT 1;\nSELECT 2;') @staticmethod def test_copy(): """Copy a sql script """ - script = SqlScript('Select 1;') + script = SqlScript('SELECT 1;') script_new = script.copy() eq_(script.sql(), script_new.sql()) @@ -91,14 +91,82 @@ def test_copy(): def test_wrap_transaction(): """Wrap the sql script in a transaction """ - script = SqlScript('Select 1;').wrap_transaction() - result = 'BEGIN;\nSelect 1;\nCOMMIT;' + script = SqlScript('SELECT 1;').wrap_transaction() + result = 'BEGIN;\nSELECT 1;\nCOMMIT;' eq_(script.sql(), result) @staticmethod def test_paranthesis(): """Test sql with paranthesis is sanatized correctly """ - script = SqlScript('create table test_begin (session_id INTEGER);') - result = 'create table test_begin (session_id INTEGER);' + script = SqlScript('CREATE TABLE test_begin (session_id INTEGER);') + result = 'CREATE TABLE test_begin (session_id INTEGER);' eq_(script.sql(), result) + + @staticmethod + def test_creates_table_success(): + """Correctly recognizes that the sql creates a table + """ + script = SqlScript('CREATE 
TABLE test_begin (session_id INTEGER);') + eq_(script.creates_table(), True) + + @staticmethod + def test_creates_table_failure(): + """Correctly recognizes that the sql does not create a table + """ + script = SqlScript('SELECT * FROM test_begin;') + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_table_failure_not_first_statement(): + """Correctly recognizes that the first sql statement does not create + a table + """ + script = SqlScript(""" + SELECT * FROM test_begin; + CREATE TABLE test_begin (session_id INTEGER); + """) + + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_table_failure_bad_syntax(): + """Correctly recognizes bad syntax when creating a view + """ + script = SqlScript( + 'CREATE TABLE test_begin AS (SELECT * FROM test_table);') + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_view_success(): + """Correctly recognizes that the sql creates a view + """ + script = SqlScript( + 'CREATE VIEW test_begin AS (SELECT * FROM test_table);') + eq_(script.creates_view(), True) + + @staticmethod + def test_creates_view_failure(): + """Correctly recognizes that the sql does not create a view + """ + script = SqlScript('SELECT * FROM test_begin;') + eq_(script.creates_table(), False) + + @staticmethod + def test_creates_view_failure_not_first_statement(): + """Correctly recognizes that the first sql statment does not create + a view + """ + script = SqlScript(""" + SELECT * FROM test_begin; + CREATE VIEW test_begin AS (SELECT * FROM test_table); + """) + + eq_(script.creates_view(), False) + + @staticmethod + def test_creates_view_failure_bad_syntax(): + """Correctly recognizes bad syntax when creating a view + """ + script = SqlScript('CREATE VIEW test_begin (session_id INTEGER);') + eq_(script.creates_view(), False) diff --git a/dataduct/database/table.py b/dataduct/database/table.py index ed7bcfe..6a81e37 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -109,7 +109,8 @@ def foreign_key_references(self): def dependencies(self): """List of tables which this table references. 
""" - return [table_name for _, table_name, _ in self.foreign_key_references] + return [table_name for _, table_name, _ + in self.foreign_key_references()] def temporary_clone_script(self): """Sql script to create a temporary clone table diff --git a/dataduct/database/tests/test_database.py b/dataduct/database/tests/test_database.py new file mode 100644 index 0000000..5417f7a --- /dev/null +++ b/dataduct/database/tests/test_database.py @@ -0,0 +1,270 @@ +"""Tests for Database +""" +import os + +from unittest import TestCase +from testfixtures import TempDirectory +from nose.tools import assert_not_equal +from nose.tools import eq_ + +from ..database import Database +from ..table import Table +from ..view import View +from ..sql import SqlScript + + +class TestDatabase(TestCase): + """Tests for Database + """ + + @staticmethod + def _create_table(sql): + """Creates a table object from a SQL string + """ + return Table(SqlScript(sql)) + + @staticmethod + def _create_view(sql): + """Creates a view object from a SQL string + """ + return View(SqlScript(sql)) + + def test_create(self): + """Tests database initialization + """ + table = self._create_table('CREATE TABLE test_begin (id INTEGER);') + database = Database(relations=[table]) + + # Verify that the database is constructed properly + eq_(database.num_tables, 1) + eq_(database.num_views, 0) + assert_not_equal(database.relation('test_begin'), None) + + @staticmethod + def test_create_from_file(): + """Tests database initialization from file + """ + with TempDirectory() as d: + # Create files in the temp directory + d.write('test_table.sql', + 'CREATE TABLE test_table (session_id INTEGER);') + d.write('test_view.sql', + 'CREATE VIEW test_view AS (SELECT * FROM test_table);') + database = Database(files=[os.path.join(d.path, 'test_table.sql'), + os.path.join(d.path, 'test_view.sql')]) + + # Verify that the database is constructed properly + eq_(database.num_tables, 1) + eq_(database.num_views, 1) + assert_not_equal(database.relation('test_table'), None) + assert_not_equal(database.relation('test_view'), None) + + @staticmethod + def test_create_from_file_no_relation(): + """Database initialization with a file that does not create a + relation + """ + with TempDirectory() as d: + # Create a file in the temp directory + d.write('test.sql', + 'SELECT * FROM test_table;') + try: + Database(files=[os.path.join(d.path, 'test.sql')]) + assert False + except ValueError: + pass + + @staticmethod + def test_create_two_arguments(): + """Must create database with less than two arguments + """ + try: + Database(relations=['test_rel'], files=['test_file']) + assert False + except ValueError: + pass + + def test_create_duplicate_relations(self): + """Database initalization with duplicate relations + """ + table = self._create_table( + 'CREATE TABLE test_begin (session_id INTEGER);') + try: + Database(relations=[table, table]) + assert False + except ValueError: + pass + + def test_database_copy(self): + """Copying a database is a deepcopy + """ + table = self._create_table( + 'CREATE TABLE test_begin (session_id INTEGER);') + database = Database(relations=[table]) + database_copy = database.copy() + + # Check that the copied database contains the relation + assert_not_equal(database_copy.relation('test_begin'), None) + + # Delete the relation in the copy + database_copy._relations = {} + + # Check that the original database still contains the relation + assert_not_equal(database.relation('test_begin'), None) + + def test_database_has_cycles(self): + 
"""Check if a database has cycles + """ + first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER REFERENCES first_table(id1), + id2 INTEGER + );""") + + database = Database(relations=[first_table, second_table]) + eq_(database.has_cycles(), True) + + def test_database_has_no_cycles(self): + """Check if a database has no cycles + """ + first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER, + id2 INTEGER + );""") + + database = Database(relations=[first_table, second_table]) + eq_(database.has_cycles(), False) + + def test_database_has_no_cycles_2(self): + """Check if a database has no cycles + """ + first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER + );""") + second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER REFERENCES first_table(id1), + id2 INTEGER + );""") + + database = Database(relations=[first_table, second_table]) + eq_(database.has_cycles(), False) + + def test_database_sorted_relations(self): + """Get the topological sort of the database + """ + first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER, + id2 INTEGER + );""") + + database = Database(relations=[first_table, second_table]) + relations = database.sorted_relations() + + # Verify that the relations are sorted correctly + eq_(len(relations), 2) + eq_(relations[0].table_name, 'second_table') + eq_(relations[1].table_name, 'first_table') + + def test_database_sorted_relations_cyclic(self): + """Get the topological sort of the database with cycles + """ + first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER REFERENCES first_table(id1), + id2 INTEGER + );""") + + database = Database(relations=[first_table, second_table]) + try: + database.sorted_relations() + assert False + except RuntimeError: + pass + + def _test_database_scripts(self, function_name, expected_sql, **kwargs): + """Generate SQL scripts with a preset database + """ + table = self._create_table('CREATE TABLE test_table ( id INTEGER );') + view = self._create_view("""CREATE VIEW test_view AS ( + SELECT id FROM test_table + );""") + database = Database(relations=[table, view]) + func = getattr(database, function_name) + eq_(func(**kwargs).sql(), expected_sql) + + def test_database_create_relations_script(self): + """Creating relations in the database + """ + self._test_database_scripts( + 'create_relations_script', + 'CREATE TABLE test_table ( id INTEGER );\n' + 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') + + def test_database_drop_relations_script(self): + """Dropping relations in the database + """ + self._test_database_scripts( + 'drop_relations_script', + 'DROP TABLE IF EXISTS test_table CASCADE;\n' + 'DROP VIEW IF EXISTS test_view CASCADE;') + + def test_database_recreate_relations_script(self): + """Recreating relations in the database + """ + self._test_database_scripts( + 'recreate_relations_script', + 'DROP TABLE IF EXISTS 
test_table CASCADE;\n' + 'CREATE TABLE test_table ( id INTEGER );\n' + 'DROP VIEW IF EXISTS test_view CASCADE;\n' + 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') + + def test_database_recreate_table_dependencies(self): + """Recreating table dependencies + """ + first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER, + id2 INTEGER + );""") + view = self._create_view( + """CREATE VIEW view AS ( + SELECT id1 FROM second_table + );""") + database = Database(relations=[first_table, second_table, view]) + + eq_(database.recreate_table_dependencies('second_table').sql(), + 'ALTER TABLE first_table ADD FOREIGN KEY (id2) ' + 'REFERENCES second_table (id2);\n' + 'DROP VIEW IF EXISTS view CASCADE;\n' + 'CREATE VIEW view AS ( SELECT id1 FROM second_table );') + eq_(database.recreate_table_dependencies('first_table').sql(), ';') diff --git a/dataduct/database/view.py b/dataduct/database/view.py index d9c6a76..e90a9ac 100644 --- a/dataduct/database/view.py +++ b/dataduct/database/view.py @@ -13,9 +13,10 @@ def __init__(self, sql): """Constructor for view class """ super(View, self).__init__() + if isinstance(sql, SqlScript): # Take the first statement and ignore the rest - sql = SqlScript.statements[0] + sql = sql.statements[0] parameters = parse_create_view(sql.sql()) diff --git a/dataduct/etl/tests/test_definition_parser.py b/dataduct/etl/tests/test_definition_parser.py index c9ec414..f66ab1f 100644 --- a/dataduct/etl/tests/test_definition_parser.py +++ b/dataduct/etl/tests/test_definition_parser.py @@ -3,8 +3,11 @@ Tests for the definition parser functions """ import unittest +from ..etl_actions import read_pipeline_definition +from ...utils.exceptions import ETLInputError -class DefitionParserTests(unittest.TestCase): + +class DefinitionParserTests(unittest.TestCase): """Tests for the definition parser. 
""" @@ -13,7 +16,11 @@ def setUp(self): """ pass - def test_yaml_extention(self): - """Test if the pipeline extention is yaml + def test_yaml_extension(self): + """Test if the yaml extension check works correctly """ - pass + try: + read_pipeline_definition("name.txt") + assert False + except ETLInputError: + pass diff --git a/dataduct/tests/test_import.py b/dataduct/tests/test_import.py index 4146bb4..786a6df 100644 --- a/dataduct/tests/test_import.py +++ b/dataduct/tests/test_import.py @@ -68,3 +68,10 @@ def test_sphinx_rtd_theme(): """ print 'Trying to import sphinx_rtd_theme' import sphinx_rtd_theme + + @staticmethod + def test_testfixtures(): + """Testing testfixtures + """ + print 'Trying to import testfixtures' + import testfixtures diff --git a/requirements.txt b/requirements.txt index edaa036..8085d0d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,4 @@ PyYAML coverage pyparsing>=2 pygraphviz +testfixtures>=4.1.1 From 18b0d1b531cf1f5fef09ed3091c380447bc8b6a9 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 19 Jan 2015 21:24:01 -0800 Subject: [PATCH 063/175] Primary Key Step --- dataduct/data_access/__init__.py | 2 + dataduct/data_access/connection.py | 1 + dataduct/etl/etl_actions.py | 2 +- dataduct/etl/etl_pipeline.py | 4 ++ dataduct/qa/utils.py | 4 +- dataduct/steps/__init__.py | 1 + dataduct/steps/primary_key_check.py | 34 ++++++++++++++++ dataduct/steps/scripts/primary_key_test.py | 35 ++++++++++++++++ dataduct/steps/transform.py | 11 +---- dataduct/utils/constants.py | 1 + examples/example_primary_key_check.yaml | 9 +++++ examples/scripts/primary_key_test.py | 47 ---------------------- examples/tables/dev.test_table.sql | 4 ++ setup.py | 15 ++----- 14 files changed, 100 insertions(+), 70 deletions(-) create mode 100644 dataduct/steps/primary_key_check.py create mode 100644 dataduct/steps/scripts/primary_key_test.py create mode 100644 examples/example_primary_key_check.yaml delete mode 100644 examples/scripts/primary_key_test.py create mode 100644 examples/tables/dev.test_table.sql diff --git a/dataduct/data_access/__init__.py b/dataduct/data_access/__init__.py index e69de29..a22c888 100644 --- a/dataduct/data_access/__init__.py +++ b/dataduct/data_access/__init__.py @@ -0,0 +1,2 @@ +from .connection import redshift_connection +from .connection import rds_connection diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py index 27fe3e9..742076a 100644 --- a/dataduct/data_access/connection.py +++ b/dataduct/data_access/connection.py @@ -3,6 +3,7 @@ """ import psycopg2 import MySQLdb +import MySQLdb.cursors from ..config import Config from ..utils.helpers import retry diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 23acc1e..441a738 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -85,7 +85,7 @@ def activate_pipeline(etl): URL_TEMPLATE.format(ID=etl.pipeline.id) -def visualize_pipeline(etl, activities_only, filename=None): +def visualize_pipeline(etl, activities_only=False, filename=None): """Visualize the pipeline that was created Args: diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 176c46e..2c2b066 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -29,6 +29,7 @@ from ..steps import SqlCommandStep from ..steps import TransformStep from ..steps import QATransformStep +from ..steps import PrimaryKeyCheckStep from ..s3 import S3File from ..s3 import S3Path @@ -432,6 +433,9 @@ def parse_step_args(self, step_type, 
**kwargs): elif step_type == 'extract-s3': step_class = ExtractS3Step + elif step_type == 'primary-key-check': + step_class = PrimaryKeyCheckStep + elif step_type == 'extract-local': step_class = ExtractLocalStep diff --git a/dataduct/qa/utils.py b/dataduct/qa/utils.py index 43b1865..c910bbb 100644 --- a/dataduct/qa/utils.py +++ b/dataduct/qa/utils.py @@ -5,4 +5,6 @@ def render_output(data): """Print the formatted output for the list """ - return '\n'.join(['[Dataduct]: '].extend(data)) + output = ['[Dataduct]: '] + output.extend(data) + return '\n'.join(output) diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index 6b8b81f..3626b83 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -10,3 +10,4 @@ from .sql_command import SqlCommandStep from .transform import TransformStep from .qa_transform import QATransformStep +from .primary_key_check import PrimaryKeyCheckStep diff --git a/dataduct/steps/primary_key_check.py b/dataduct/steps/primary_key_check.py new file mode 100644 index 0000000..28e631d --- /dev/null +++ b/dataduct/steps/primary_key_check.py @@ -0,0 +1,34 @@ +""" +ETL step wrapper for PK check step can be executed on Ec2 resource +""" +import os + +from .qa_transform import QATransformStep +from ..config import Config +from ..utils import constants as const +from ..utils.helpers import parse_path + +config = Config() + + +class PrimaryKeyCheckStep(QATransformStep): + """PrimaryKeyCheckStep class that checks a table for PK violations + """ + + def __init__(self, id, table_definition, **kwargs): + """Constructor for the PrimaryKeyCheckStep class + + Args: + table_definition(file): table definition for the table to check + **kwargs(optional): Keyword arguments directly passed to base class + """ + with open(parse_path(table_definition)) as f: + table_def_string = f.read() + + script_arguments = ['--table=%s' % table_def_string] + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.PK_CHECK_SCRIPT_PATH) + + super(PrimaryKeyCheckStep, self).__init__( + id=id, script=script, script_arguments=script_arguments, **kwargs) diff --git a/dataduct/steps/scripts/primary_key_test.py b/dataduct/steps/scripts/primary_key_test.py new file mode 100644 index 0000000..0d9b8d2 --- /dev/null +++ b/dataduct/steps/scripts/primary_key_test.py @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +"""Script that checks for primary key violations on the input table +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.database import SqlScript +from dataduct.database import Table +from dataduct.qa import PrimaryKeyCheck + + +def main(): + """Main function + """ + parser = argparse.ArgumentParser() + + parser.add_argument('--table', dest='table', required=True) + parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) + parser.add_argument('--test_name', dest='test_name', default="Check Column") + + args = parser.parse_args() + + connection = redshift_connection() + table = Table(SqlScript(args.table)) + result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) + check = PrimaryKeyCheck(len(result), name=args.test_name, + sns_topic_arn=args.sns_topic_arn) + check.publish() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index ff923c8..ff03cf0 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -127,15 +127,8 @@ def 
translate_arguments(self, script_arguments): result = list() for argument in script_arguments: if isinstance(argument, dict): - argument_type = argument.get('type', - SCRIPT_ARGUMENT_TYPE_STRING) - if argument_type == SCRIPT_ARGUMENT_TYPE_SQL: - # TODO: Change to SQL Parsing - result.append(self.input_format( - argument['name'], argument['value'])) - else: - result.append(self.input_format( - argument['name'], argument['value'])) + result.extend([self.input_format(key, value) + for key, value in argument.iteritems()]) else: result.append(str(argument)) return result diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index ad46978..aa3b15b 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -25,3 +25,4 @@ SCRIPT_RUNNER_PATH = os.path.join(SCRIPTS_DIRECTORY, 'script_runner.py') DEPENDENCY_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'pipeline_dependency_check.py') +PK_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'primary_key_test.py') diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml new file mode 100644 index 0000000..c8c2218 --- /dev/null +++ b/examples/example_primary_key_check.yaml @@ -0,0 +1,9 @@ +name : example_primary_key_check +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +description : Example for the primary-key-check step + +steps: +- step_type: primary-key-check + table_definition: examples/tables/dev.test_table.sql diff --git a/examples/scripts/primary_key_test.py b/examples/scripts/primary_key_test.py deleted file mode 100644 index 8d38f22..0000000 --- a/examples/scripts/primary_key_test.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Script that checks for primary key violations on the input table -""" -#!/usr/bin/env python - -import argparse -import pandas.io.sql as pdsql -from dataduct.qa import PrimaryKeyCheck -from dataduct.data_access.connection import redshift_connection - - -def query_redshift(production, query): - """ - Input: - - prod -- whether to reference the prod table - - query -- a query that computes a count - Output: - - the value returned by the query - """ - print "Running query", query - return pdsql.read_sql(query, redshift_connection()) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - parser.add_argument('--table', dest='table', required=True) - parser.add_argument('--production', dest='production', action='store_true') - parser.add_argument('--pipeline_name', dest='pipeline_name', required=True) - - parser.add_argument( - '--sns_topic', dest='sns_topic', default=None) - parser.add_argument( - '--test_name', dest='test_name', default="Check Maestro Column") - - args = parser.parse_args() - print "Got args for check primary key", args - - table = Table(script=args.table) - result = pdsql.read_sql( - table.select_duplicates_sql().raw_sql(), redshift_connection()) - - check = PrimaryKeyCheck( - len(result), args.test_name, get_sns_alert_fn(args.sns_topic)) - check.publish(qa_check_export_fn( - args.production, args.pipeline_name, table=table.full_name)) - - print "Passed test." 
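The new steps/scripts/primary_key_test.py above counts the rows returned by Table.select_duplicates_script(), whose implementation is not part of this patch. As a rough, hypothetical sketch (the helper name and the exact SQL dataduct generates are assumptions), the check amounts to grouping by the primary key columns and flagging any key value that occurs more than once:

# Hypothetical illustration only; dataduct's select_duplicates_script()
# may build this SQL differently.
def duplicate_primary_key_sql(table_name, primary_keys):
    """Return SQL selecting every primary key value that occurs twice or more."""
    keys = ', '.join(primary_keys)
    return ('SELECT {keys}, COUNT(1) AS occurrences '
            'FROM {table} '
            'GROUP BY {keys} '
            'HAVING COUNT(1) > 1').format(keys=keys, table=table_name)

# For the dev.test_table defined below, the primary key is `id`:
print duplicate_primary_key_sql('dev.test_table', ['id'])
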
diff --git a/examples/tables/dev.test_table.sql b/examples/tables/dev.test_table.sql new file mode 100644 index 0000000..238486f --- /dev/null +++ b/examples/tables/dev.test_table.sql @@ -0,0 +1,4 @@ +CREATE TABLE dev.test_table( + id INTEGER PRIMARY KEY, + description VARCHAR(255) +); diff --git a/setup.py b/setup.py index 896c1c9..9c2017d 100644 --- a/setup.py +++ b/setup.py @@ -2,23 +2,14 @@ Setup file for installation of the etllib code """ from setuptools import setup +from setuptools import find_packages setup( name='dataduct', version='0.1.0', author='Coursera Inc.', - packages=[ - 'dataduct', - 'dataduct.config', - 'dataduct.data_access', - 'dataduct.database', - 'dataduct.etl', - 'dataduct.pipeline', - 'dataduct.qa', - 'dataduct.s3', - 'dataduct.steps', - 'dataduct.utils', - ], + packages=find_packages( + exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), namespace_packages=['dataduct'], include_package_data=True, url='https://github.com/coursera/dataduct', From 5d177c82ebfb209bb5b96629961a82c116f1a8fa Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Tue, 20 Jan 2015 10:28:15 -0800 Subject: [PATCH 064/175] Code review changes --- dataduct/database/parsers/select_query.py | 15 +- .../parsers/tests/test_create_table.py | 15 +- dataduct/database/select_statement.py | 4 +- dataduct/database/tests/test_database.py | 202 +++++++----------- dataduct/etl/tests/test_definition_parser.py | 9 +- 5 files changed, 99 insertions(+), 146 deletions(-) diff --git a/dataduct/database/parsers/select_query.py b/dataduct/database/parsers/select_query.py index 192a588..ee87b84 100644 --- a/dataduct/database/parsers/select_query.py +++ b/dataduct/database/parsers/select_query.py @@ -14,16 +14,15 @@ from .utils import def_field -def parse_select_base(statement): +def parse_select_base(string): """Parse a select query and return the dependencies Args: - statement(SqlStatement): Input sql statement that should be parsed + string(str): Input string to be parsed Returns: result(list of str): List of dependent tables """ - string = statement.sql() if string == '': return @@ -34,16 +33,15 @@ def parse_select_base(statement): base_parser.parseString(string) -def parse_select_dependencies(statement): +def parse_select_dependencies(string): """Parse a select query and return the dependencies Args: - statement(SqlStatement): Input sql statement that should be parsed + string(str): Input string to be parsed Returns: result(list of str): List of dependent tables """ - string = statement.sql() if string == '': return list() @@ -59,16 +57,15 @@ def parse_select_dependencies(statement): return list(set(flattened_output)) -def parse_select_columns(statement): +def parse_select_columns(string): """Parse a select query and return the columns Args: - statement(SqlStatement): Input sql statement that should be parsed + string(str): Input string to be parsed Returns: result(list of str): List of columns """ - string = statement.sql() if string == '': return list() diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index 6a346cd..ec44739 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -3,6 +3,7 @@ from unittest import TestCase from nose.tools import eq_ +from nose.tools import raises from pyparsing import ParseException @@ -33,27 +34,21 @@ def test_basic(): eq_(len(output['columns']), 2) @staticmethod + @raises(ParseException) def test_bad_input(): """Feeding malformed input 
into create table """ query = 'CREATE TABLE orders (' +\ 'customer_id INTEGER DISTKEY PRIMARY KEY' - try: - parse_create_table(query) - assert False - except ParseException: - pass + parse_create_table(query) @staticmethod + @raises(ParseException) def test_bad_input_in_columns(): """Feeding malformed input into create table """ query = 'CREATE TABLE orders (' +\ 'customer_id NEGATIVE DISTKEY PRIMARY KEY)' - try: - parse_create_table(query) - assert False - except ParseException: - pass + parse_create_table(query) diff --git a/dataduct/database/select_statement.py b/dataduct/database/select_statement.py index 8c9e37e..32ee3d0 100644 --- a/dataduct/database/select_statement.py +++ b/dataduct/database/select_statement.py @@ -16,8 +16,8 @@ def __init__(self, sql): """ super(SelectStatement, self).__init__(sql) - self._dependencies = parse_select_dependencies(self) - self._raw_columns = parse_select_columns(self) + self._dependencies = parse_select_dependencies(self.sql()) + self._raw_columns = parse_select_columns(self.sql()) self._columns = [ Column(parse_column_name(c), None) for c in self._raw_columns] diff --git a/dataduct/database/tests/test_database.py b/dataduct/database/tests/test_database.py index 5417f7a..ef65b09 100644 --- a/dataduct/database/tests/test_database.py +++ b/dataduct/database/tests/test_database.py @@ -6,6 +6,7 @@ from testfixtures import TempDirectory from nose.tools import assert_not_equal from nose.tools import eq_ +from nose.tools import raises from ..database import Database from ..table import Table @@ -29,37 +30,77 @@ def _create_view(sql): """ return View(SqlScript(sql)) + def setUp(self): + """Setup test fixtures for the database tests + """ + # A basic table and view + self.basic_table = self._create_table( + 'CREATE TABLE test_table (id INTEGER);') + self.basic_view = self._create_view( + 'CREATE VIEW test_view AS (SELECT * FROM test_table);') + + # Create tables with dependencies between them + self.first_table = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER + );""") + self.first_table_dependent = self._create_table( + """CREATE TABLE first_table ( + id1 INTEGER, + id2 INTEGER REFERENCES second_table(id2) + );""") + self.second_table = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER, + id2 INTEGER + );""") + self.second_table_dependent = self._create_table( + """CREATE TABLE second_table ( + id1 INTEGER REFERENCES first_table(id1), + id2 INTEGER + );""") + + # Create a template database to test script generation + table = self._create_table('CREATE TABLE test_table ( id INTEGER );') + view = self._create_view("""CREATE VIEW test_view AS ( + SELECT id FROM test_table + );""") + self.script_database = Database(relations=[table, view]) + def test_create(self): """Tests database initialization """ - table = self._create_table('CREATE TABLE test_begin (id INTEGER);') - database = Database(relations=[table]) + database = Database(relations=[self.basic_table]) # Verify that the database is constructed properly eq_(database.num_tables, 1) eq_(database.num_views, 0) - assert_not_equal(database.relation('test_begin'), None) + assert_not_equal(database.relation(self.basic_table.full_name), None) - @staticmethod - def test_create_from_file(): + def test_create_from_file(self): """Tests database initialization from file """ with TempDirectory() as d: # Create files in the temp directory - d.write('test_table.sql', - 'CREATE TABLE test_table (session_id INTEGER);') - d.write('test_view.sql', - 'CREATE VIEW test_view 
AS (SELECT * FROM test_table);') - database = Database(files=[os.path.join(d.path, 'test_table.sql'), - os.path.join(d.path, 'test_view.sql')]) + d.write(self.basic_table.full_name, + self.basic_table.sql_statement.sql()) + d.write(self.basic_view.full_name, + self.basic_view.sql_statement.sql()) + database = Database( + files=[os.path.join(d.path, self.basic_table.full_name), + os.path.join(d.path, self.basic_view.full_name)]) # Verify that the database is constructed properly eq_(database.num_tables, 1) eq_(database.num_views, 1) - assert_not_equal(database.relation('test_table'), None) - assert_not_equal(database.relation('test_view'), None) + assert_not_equal( + database.relation(self.basic_table.full_name), None) + assert_not_equal( + database.relation(self.basic_view.full_name), None) @staticmethod + @raises(ValueError) def test_create_from_file_no_relation(): """Database initialization with a file that does not create a relation @@ -68,153 +109,83 @@ def test_create_from_file_no_relation(): # Create a file in the temp directory d.write('test.sql', 'SELECT * FROM test_table;') - try: - Database(files=[os.path.join(d.path, 'test.sql')]) - assert False - except ValueError: - pass + Database(files=[os.path.join(d.path, 'test.sql')]) @staticmethod + @raises(ValueError) def test_create_two_arguments(): """Must create database with less than two arguments """ - try: - Database(relations=['test_rel'], files=['test_file']) - assert False - except ValueError: - pass + Database(relations=['test_rel'], files=['test_file']) + @raises(ValueError) def test_create_duplicate_relations(self): - """Database initalization with duplicate relations + """Database initialization with duplicate relations """ - table = self._create_table( - 'CREATE TABLE test_begin (session_id INTEGER);') - try: - Database(relations=[table, table]) - assert False - except ValueError: - pass + Database(relations=[self.basic_table, self.basic_table]) def test_database_copy(self): """Copying a database is a deepcopy """ - table = self._create_table( - 'CREATE TABLE test_begin (session_id INTEGER);') - database = Database(relations=[table]) + database = Database(relations=[self.basic_table]) database_copy = database.copy() # Check that the copied database contains the relation - assert_not_equal(database_copy.relation('test_begin'), None) + assert_not_equal( + database_copy.relation(self.basic_table.full_name), None) # Delete the relation in the copy database_copy._relations = {} # Check that the original database still contains the relation - assert_not_equal(database.relation('test_begin'), None) + assert_not_equal( + database.relation(self.basic_table.full_name), None) def test_database_has_cycles(self): """Check if a database has cycles """ - first_table = self._create_table( - """CREATE TABLE first_table ( - id1 INTEGER, - id2 INTEGER REFERENCES second_table(id2) - );""") - second_table = self._create_table( - """CREATE TABLE second_table ( - id1 INTEGER REFERENCES first_table(id1), - id2 INTEGER - );""") - - database = Database(relations=[first_table, second_table]) + database = Database(relations=[self.first_table_dependent, + self.second_table_dependent]) eq_(database.has_cycles(), True) def test_database_has_no_cycles(self): """Check if a database has no cycles """ - first_table = self._create_table( - """CREATE TABLE first_table ( - id1 INTEGER, - id2 INTEGER REFERENCES second_table(id2) - );""") - second_table = self._create_table( - """CREATE TABLE second_table ( - id1 INTEGER, - id2 INTEGER - );""") - - database = 
Database(relations=[first_table, second_table]) + database = Database(relations=[self.first_table_dependent, + self.second_table]) eq_(database.has_cycles(), False) def test_database_has_no_cycles_2(self): """Check if a database has no cycles """ - first_table = self._create_table( - """CREATE TABLE first_table ( - id1 INTEGER, - id2 INTEGER - );""") - second_table = self._create_table( - """CREATE TABLE second_table ( - id1 INTEGER REFERENCES first_table(id1), - id2 INTEGER - );""") - - database = Database(relations=[first_table, second_table]) + database = Database(relations=[self.first_table, + self.second_table_dependent]) eq_(database.has_cycles(), False) def test_database_sorted_relations(self): """Get the topological sort of the database """ - first_table = self._create_table( - """CREATE TABLE first_table ( - id1 INTEGER, - id2 INTEGER REFERENCES second_table(id2) - );""") - second_table = self._create_table( - """CREATE TABLE second_table ( - id1 INTEGER, - id2 INTEGER - );""") - - database = Database(relations=[first_table, second_table]) + database = Database(relations=[self.first_table_dependent, + self.second_table]) relations = database.sorted_relations() # Verify that the relations are sorted correctly eq_(len(relations), 2) - eq_(relations[0].table_name, 'second_table') - eq_(relations[1].table_name, 'first_table') + eq_(relations[0].table_name, self.second_table.full_name) + eq_(relations[1].table_name, self.first_table_dependent.full_name) + @raises(RuntimeError) def test_database_sorted_relations_cyclic(self): """Get the topological sort of the database with cycles """ - first_table = self._create_table( - """CREATE TABLE first_table ( - id1 INTEGER, - id2 INTEGER REFERENCES second_table(id2) - );""") - second_table = self._create_table( - """CREATE TABLE second_table ( - id1 INTEGER REFERENCES first_table(id1), - id2 INTEGER - );""") - - database = Database(relations=[first_table, second_table]) - try: - database.sorted_relations() - assert False - except RuntimeError: - pass + database = Database(relations=[self.first_table_dependent, + self.second_table_dependent]) + database.sorted_relations() def _test_database_scripts(self, function_name, expected_sql, **kwargs): """Generate SQL scripts with a preset database """ - table = self._create_table('CREATE TABLE test_table ( id INTEGER );') - view = self._create_view("""CREATE VIEW test_view AS ( - SELECT id FROM test_table - );""") - database = Database(relations=[table, view]) - func = getattr(database, function_name) + func = getattr(self.script_database, function_name) eq_(func(**kwargs).sql(), expected_sql) def test_database_create_relations_script(self): @@ -246,21 +217,12 @@ def test_database_recreate_relations_script(self): def test_database_recreate_table_dependencies(self): """Recreating table dependencies """ - first_table = self._create_table( - """CREATE TABLE first_table ( - id1 INTEGER, - id2 INTEGER REFERENCES second_table(id2) - );""") - second_table = self._create_table( - """CREATE TABLE second_table ( - id1 INTEGER, - id2 INTEGER - );""") view = self._create_view( """CREATE VIEW view AS ( SELECT id1 FROM second_table );""") - database = Database(relations=[first_table, second_table, view]) + database = Database(relations=[self.first_table_dependent, + self.second_table, view]) eq_(database.recreate_table_dependencies('second_table').sql(), 'ALTER TABLE first_table ADD FOREIGN KEY (id2) ' diff --git a/dataduct/etl/tests/test_definition_parser.py b/dataduct/etl/tests/test_definition_parser.py index 
f66ab1f..b62617e 100644 --- a/dataduct/etl/tests/test_definition_parser.py +++ b/dataduct/etl/tests/test_definition_parser.py @@ -3,6 +3,8 @@ Tests for the definition parser functions """ import unittest +from nose.tools import raises + from ..etl_actions import read_pipeline_definition from ...utils.exceptions import ETLInputError @@ -16,11 +18,8 @@ def setUp(self): """ pass + @raises(ETLInputError) def test_yaml_extension(self): """Test if the yaml extension check works correctly """ - try: - read_pipeline_definition("name.txt") - assert False - except ETLInputError: - pass + read_pipeline_definition("name.txt") From b1c7de3ba811cafb7d68d23182d930c4e2bf7dd6 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Tue, 20 Jan 2015 13:27:18 -0800 Subject: [PATCH 065/175] More code review changes --- .../parsers/tests/test_create_table.py | 3 -- .../database/sql/tests/test_sql_script.py | 2 -- dataduct/database/tests/test_database.py | 36 +++++++++---------- dataduct/etl/tests/test_definition_parser.py | 5 --- 4 files changed, 17 insertions(+), 29 deletions(-) diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index ec44739..fe536c5 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -4,7 +4,6 @@ from unittest import TestCase from nose.tools import eq_ from nose.tools import raises - from pyparsing import ParseException from ..create_table import parse_create_table @@ -40,7 +39,6 @@ def test_bad_input(): """ query = 'CREATE TABLE orders (' +\ 'customer_id INTEGER DISTKEY PRIMARY KEY' - parse_create_table(query) @staticmethod @@ -50,5 +48,4 @@ def test_bad_input_in_columns(): """ query = 'CREATE TABLE orders (' +\ 'customer_id NEGATIVE DISTKEY PRIMARY KEY)' - parse_create_table(query) diff --git a/dataduct/database/sql/tests/test_sql_script.py b/dataduct/database/sql/tests/test_sql_script.py index 8b1eeef..dd6121f 100644 --- a/dataduct/database/sql/tests/test_sql_script.py +++ b/dataduct/database/sql/tests/test_sql_script.py @@ -126,7 +126,6 @@ def test_creates_table_failure_not_first_statement(): SELECT * FROM test_begin; CREATE TABLE test_begin (session_id INTEGER); """) - eq_(script.creates_table(), False) @staticmethod @@ -161,7 +160,6 @@ def test_creates_view_failure_not_first_statement(): SELECT * FROM test_begin; CREATE VIEW test_begin AS (SELECT * FROM test_table); """) - eq_(script.creates_view(), False) @staticmethod diff --git a/dataduct/database/tests/test_database.py b/dataduct/database/tests/test_database.py index ef65b09..2fad6c1 100644 --- a/dataduct/database/tests/test_database.py +++ b/dataduct/database/tests/test_database.py @@ -191,28 +191,26 @@ def _test_database_scripts(self, function_name, expected_sql, **kwargs): def test_database_create_relations_script(self): """Creating relations in the database """ - self._test_database_scripts( - 'create_relations_script', - 'CREATE TABLE test_table ( id INTEGER );\n' - 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') + + result = ('CREATE TABLE test_table ( id INTEGER );\n' + 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') + self._test_database_scripts('create_relations_script', result) def test_database_drop_relations_script(self): """Dropping relations in the database """ - self._test_database_scripts( - 'drop_relations_script', - 'DROP TABLE IF EXISTS test_table CASCADE;\n' - 'DROP VIEW IF EXISTS test_view CASCADE;') + result = ('DROP TABLE IF EXISTS test_table 
CASCADE;\n' + 'DROP VIEW IF EXISTS test_view CASCADE;') + self._test_database_scripts('drop_relations_script', result) def test_database_recreate_relations_script(self): """Recreating relations in the database """ - self._test_database_scripts( - 'recreate_relations_script', - 'DROP TABLE IF EXISTS test_table CASCADE;\n' - 'CREATE TABLE test_table ( id INTEGER );\n' - 'DROP VIEW IF EXISTS test_view CASCADE;\n' - 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') + result = ('DROP TABLE IF EXISTS test_table CASCADE;\n' + 'CREATE TABLE test_table ( id INTEGER );\n' + 'DROP VIEW IF EXISTS test_view CASCADE;\n' + 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') + self._test_database_scripts('recreate_relations_script', result) def test_database_recreate_table_dependencies(self): """Recreating table dependencies @@ -224,9 +222,9 @@ def test_database_recreate_table_dependencies(self): database = Database(relations=[self.first_table_dependent, self.second_table, view]) - eq_(database.recreate_table_dependencies('second_table').sql(), - 'ALTER TABLE first_table ADD FOREIGN KEY (id2) ' - 'REFERENCES second_table (id2);\n' - 'DROP VIEW IF EXISTS view CASCADE;\n' - 'CREATE VIEW view AS ( SELECT id1 FROM second_table );') + result = ('ALTER TABLE first_table ADD FOREIGN KEY (id2) ' + 'REFERENCES second_table (id2);\n' + 'DROP VIEW IF EXISTS view CASCADE;\n' + 'CREATE VIEW view AS ( SELECT id1 FROM second_table );') + eq_(database.recreate_table_dependencies('second_table').sql(), result) eq_(database.recreate_table_dependencies('first_table').sql(), ';') diff --git a/dataduct/etl/tests/test_definition_parser.py b/dataduct/etl/tests/test_definition_parser.py index b62617e..439d587 100644 --- a/dataduct/etl/tests/test_definition_parser.py +++ b/dataduct/etl/tests/test_definition_parser.py @@ -13,11 +13,6 @@ class DefinitionParserTests(unittest.TestCase): """Tests for the definition parser. 
""" - def setUp(self): - """Fixtures for the definition test - """ - pass - @raises(ETLInputError) def test_yaml_extension(self): """Test if the yaml extension check works correctly From d826af4b0c98e4d9ec2dc4fc7a92c80109753327 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 21 Jan 2015 11:07:31 -0800 Subject: [PATCH 066/175] Added History Table Class, bug fixes --- dataduct/database/database.py | 2 +- dataduct/database/history_table.py | 202 ++++++++++++++++++ dataduct/database/relation.py | 2 +- dataduct/database/select_statement.py | 1 - dataduct/database/table.py | 43 ++-- dataduct/database/tests/test_history_table.py | 104 +++++++++ 6 files changed, 336 insertions(+), 18 deletions(-) create mode 100644 dataduct/database/history_table.py create mode 100644 dataduct/database/tests/test_history_table.py diff --git a/dataduct/database/database.py b/dataduct/database/database.py index aa64d78..b6e2d56 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -199,7 +199,7 @@ def _make_node_label(relation): """ columns = list() row = '{col_name}{pk}' - for column in sorted(relation.columns, key=lambda x: x.position): + for column in sorted(relation.columns(), key=lambda x: x.position): columns.append(row.format(col_name=column.name, pk=' (PK)' if column.primary else '')) diff --git a/dataduct/database/history_table.py b/dataduct/database/history_table.py new file mode 100644 index 0000000..4d1203f --- /dev/null +++ b/dataduct/database/history_table.py @@ -0,0 +1,202 @@ +"""Script containing the history table class object +Child of the table class object +""" + +from .table import Table +from .sql import SqlScript +from .select_statement import SelectStatement + +HIST_EFFECTIVE_COLUMN = 'effective_ts' +HIST_EXPIRATION_COLUMN = 'expiration_ts' +HIST_EXPIRATION_MAX = '9999-12-31 23:59:59.999999' + + +class HistoryTable(Table): + """A history table is a table specifically designed to represent + Slowly Changing Dimensions + (http://en.wikipedia.org/wiki/Slowly_changing_dimension). + + Its first two columns must be an effective timestamp and an expiration + timestamp, but otherwise it looks just like a regular table. 
+ """ + + def __init__(self, sql): + """Constructor for the HistoryTable class + """ + super(HistoryTable, self).__init__(sql) + # Check that first column is the effective timestamp + # And the second column is the expiration timestamp + if self.column(HIST_EFFECTIVE_COLUMN) is None or\ + self.column(HIST_EXPIRATION_COLUMN) is None: + raise ValueError('History table must have effective and expiration' + ' timestamps') + + def _select_current_script(self): + """SQL script to select current view of table + """ + + # Get all columns except for the two timestamps + selected_columns = [c.name for c in self.columns() + if c.name != HIST_EFFECTIVE_COLUMN and + c.name != HIST_EXPIRATION_COLUMN] + + return SelectStatement(""" + SELECT {selected_columns} + FROM {history_name} + WHERE {expiration_column} = \'{expiration_max}\' + """.format(selected_columns=', '.join(selected_columns), + history_name=self.full_name, + expiration_column=HIST_EXPIRATION_COLUMN, + expiration_max=HIST_EXPIRATION_MAX)) + + def _expire_history_script(self, source): + """SQL script to expire outdated records + + Args: + source (Table): The source from which to update history + + Returns: + SqlScript: a SQL statement that removes outdated records + + A history row will be expired if: + It is currently unexpired (expiration timestamp is at max); and + either: + It's corresponding row in the source table has been changed; or + It's corresponding row in the source table has been deleted. + """ + + if not isinstance(source, Table): + raise ValueError('Source must be a table') + + # Get the secondary columns of the table + secondary_columns = [column for column in source.columns() + if not column.primary] + + # There must be at least one primary and secondary key + if len(source.primary_keys) == 0: + raise ValueError('Source table must have a primary key') + if len(secondary_columns) == 0: + raise ValueError('Source table must have a non-primary column') + + # Expire if corresponding row in the source table has been changed + # First, match primary key info to determine corresponding rows + same_statement =\ + '{history_name}.{column_name} = {source_name}.{column_name}' + matching_primary_keys_condition = ' AND '.join( + [same_statement.format(history_name=self.full_name, + source_name=source.full_name, + column_name=column.name) + for column in source.primary_keys] + ) + # Then, filter to get only the records that have changed + # A record has been changed if one of it's non-primary columns + # are different + different_statement = """ + {history_name}.{column_name} != {source_name}.{column_name} + OR ( + {history_name}.{column_name} IS NULL + AND {source_name}.{column_name} IS NOT NULL + ) + OR ( + {history_name}.{column_name} IS NOT NULL + AND {source_name}.{column_name} IS NULL + ) + """ + record_changed_condition = '(' + ' OR '.join( + [different_statement.format(history_name=self.full_name, + source_name=source.full_name, + column_name=column.name) + for column in secondary_columns] + ) + ')' + # Lastly, filter to get only the non-expired columns + # This statement will be reused for the removal check + not_expired_condition =\ + '{expiration_column} = \'{expiration_max}\''.format( + expiration_column=HIST_EXPIRATION_COLUMN, + expiration_max=HIST_EXPIRATION_MAX, + ) + # Expire changed columns + script = SqlScript(""" + UPDATE {history_name} + SET {expiration_column} = SYSDATE - INTERVAL '0.000001 seconds' + FROM {source_name} + WHERE {matching_primary_keys} + AND {record_changed} + AND {not_expired}; + 
""".format(history_name=self.full_name, + expiration_column=HIST_EXPIRATION_COLUMN, + source_name=source.full_name, + matching_primary_keys=matching_primary_keys_condition, + record_changed=record_changed_condition, + not_expired=not_expired_condition)) + + # Expire if corresponding row in the source table has been deleted + # Filter to get the history rows which have primary keys + # that are no longer in the source table + primary_keys = ",".join([name for name in source.primary_key_names]) + missing_primary_keys_condition = """ + ( + {primary_keys} + ) + NOT IN ( + SELECT {primary_keys} + FROM {source_name} + ) + """.format(primary_keys=primary_keys, + source_name=source.full_name) + + script.append(""" + UPDATE {history_name} + SET {expiration_column} = SYSDATE - INTERVAL '0.000001 seconds' + WHERE {missing_primary_keys} + AND {not_expired}; + """.format(history_name=self.full_name, + expiration_column=HIST_EXPIRATION_COLUMN, + missing_primary_keys=missing_primary_keys_condition, + not_expired=not_expired_condition)) + return script + + def update_history_script(self, source): + """SQL script to update the history table + + Args: + source (Table): The source from which to update history + + Returns: + SqlScript: a SQL statement that updates history + + Raises: + ValueError: If source is not a Table object + """ + + if not isinstance(source, Table): + raise ValueError('Source must be a table') + + # Create a temporary copy of the source relation as another table + temp_table = Table(source.temporary_clone_script()) + result = temp_table.create_script() + + # Insert the values of the original table into the temp table + result.append(temp_table.insert_script(source)) + + # Expire outdated records + result.append(self._expire_history_script(source)) + + # Delete records from the temp table that have not changed + result.append( + temp_table.delete_matching_rows_script( + self._select_current_script())) + + # Insert the remaining rows into destination + select_statement = SelectStatement(""" + SELECT SYSDATE, \'{expiration_max}\'::TIMESTAMP, {columns} + FROM {temp_table_name} + """.format(expiration_max=HIST_EXPIRATION_MAX, + columns=', '.join( + [c.name for c in temp_table.columns()]), + temp_table_name=temp_table.full_name)) + result.append(self.insert_script(select_statement)) + + # Drop the temp table, in case the temporary flag isn't enough + result.append(temp_table.drop_script()) + return result diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py index f8066b6..73da59a 100644 --- a/dataduct/database/relation.py +++ b/dataduct/database/relation.py @@ -13,7 +13,7 @@ class Relation(object): def __str__(self): """Output for the print statement of the relation """ - return self.sql_statement + return self.sql_statement.sql() def copy(self): """Create a copy of the relation object diff --git a/dataduct/database/select_statement.py b/dataduct/database/select_statement.py index 32ee3d0..6f1d11a 100644 --- a/dataduct/database/select_statement.py +++ b/dataduct/database/select_statement.py @@ -27,7 +27,6 @@ def dependencies(self): """ return self._dependencies - @property def columns(self): """Table columns of the select statement """ diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 6a81e37..aeb28a2 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -71,29 +71,33 @@ def update_columns_with_constrains(self): for col_name in constraint.get('pk_columns', list()): self._columns[col_name].primary = True - @property def 
columns(self): - """Columns for the table + """Unsorted list of columns in the table """ return self._columns.values() + def column(self, column_name): + """Get the column with the given name + """ + return self._columns.get(column_name, None) + @property def primary_keys(self): """Primary keys of the table """ - return [c for c in self.columns if c.primary] + return [c for c in self.columns() if c.primary] @property def primary_key_names(self): """Primary keys of the table """ - return [c.name for c in self.columns if c.primary] + return [c.name for c in self.columns() if c.primary] def foreign_key_references(self): """Get a list of all foreign key references from the table """ result = list() - for column in self.columns: + for column in self.columns(): if column.fk_table is not None: result.append(( [column.name], column.fk_table, [column.fk_reference])) @@ -123,10 +127,20 @@ def temporary_clone_script(self): table_name = self.table_name + '_temp' # Create a list of column definitions - columns = comma_seperated( - ['%s %s' %(c.column_name, c.column_type) for c in self.columns]) + # We need to keep primary key constraints on the temp table + column_template = '{column_name} {column_type} {primary_text}' + columns = [] + for column in self.columns(): + primary_text = '' + if column.primary: + primary_text = 'PRIMARY KEY' + columns.append( + column_template.format(column_name=column.column_name, + column_type=column.column_type, + primary_text=primary_text)) + + columns = comma_seperated(columns) - # We don't need any constraints to be specified on the temp table sql = ['CREATE TEMPORARY TABLE %s ( %s )' % (table_name, columns)] return SqlScript(sql) @@ -185,11 +199,11 @@ def select_duplicates_script(self): def _source_sql(self, source_relation): """Get the source sql based on the type of the source specified """ - if not (isinstance(source_relation, Relation) or \ + if not (isinstance(source_relation, Relation) or isinstance(source_relation, SelectStatement)): raise ValueError('Source Relation must be a relation or select') - if len(self.columns) < len(source_relation.columns): + if len(self.columns()) < len(source_relation.columns()): raise ValueError('Source has more columns than destination') if isinstance(source_relation, SelectStatement): @@ -202,7 +216,7 @@ def _source_sql(self, source_relation): def insert_script(self, source_relation): """Sql Script to insert into the table while avoiding PK violations """ - sql = 'INSERT INTO %s (SELECT * FROM %s)' %( + sql = 'INSERT INTO %s (SELECT * FROM %s)' % ( self.full_name, self._source_sql(source_relation)) return SqlScript(sql) @@ -214,11 +228,10 @@ def delete_matching_rows_script(self, source_relation): 'Cannot delete matching rows from table with no primary keys') source_col_names, pk_names = [], [] - source_columns = source_relation.columns - for i, column in enumerate(self.columns): + for column in self.columns(): if column.primary: pk_names.append(column.name) - source_col_names.append(source_columns[i].name) + source_col_names.append(column.name) where_condition = 'WHERE (%s) IN (SELECT DISTINCT %s FROM %s)' % ( comma_seperated(pk_names), comma_seperated(source_col_names), @@ -234,7 +247,7 @@ def de_duplication_script(self): 'Cannot de-duplicate table with no primary keys') script = self.temporary_clone_script() - column_names = [c.name for c in self.columns] + column_names = [c.name for c in self.columns()] # Create a temporary clone from the script temp_table = self.__class__(script) diff --git 
a/dataduct/database/tests/test_history_table.py b/dataduct/database/tests/test_history_table.py new file mode 100644 index 0000000..b1d0fa8 --- /dev/null +++ b/dataduct/database/tests/test_history_table.py @@ -0,0 +1,104 @@ +"""Tests for the HistoryTable class +""" +from unittest import TestCase +from nose.tools import raises +from nose.tools import eq_ + +from ..sql.sql_script import SqlScript +from ..table import Table +from ..history_table import HistoryTable + + +class TestHistoryTable(TestCase): + """Tests for the HistoryTable class + """ + + @staticmethod + def _create_history_table(sql): + return HistoryTable(SqlScript(sql)) + + @staticmethod + def _create_table(sql): + return Table(SqlScript(sql)) + + def setUp(self): + """Setup test fixtures + """ + self.basic_table = self._create_table( + """CREATE TABLE test_table ( + id INTEGER PRIMARY KEY, + value VARCHAR(25) + );""") + self.basic_history_table = self._create_history_table( + """CREATE TABLE test_history_table ( + effective_ts TIMESTAMP, + expiration_ts TIMESTAMP, + id INTEGER, + value VARCHAR(25) + );""") + + @raises(ValueError) + def test_create_history_table_no_timestamps(self): + """Tests if creating a history table with no timestamps + returns an error + """ + self._create_history_table('CREATE TABLE test_table ( id INTEGER );') + + def test_history_script(self): + """Diff comparison of generated SQL script + """ + sql = ( + 'CREATE TEMPORARY TABLE test_table_temp ( ' + 'id INTEGER PRIMARY KEY,' + 'value VARCHAR(25) ' + ');\n' + 'INSERT INTO test_table_temp (SELECT * FROM test_table);\n' + + 'UPDATE test_history_table ' + 'SET expiration_ts = SYSDATE - INTERVAL \'0.000001 seconds\' ' + 'FROM test_table ' + 'WHERE test_history_table.id = test_table.id ' + 'AND ( ' + 'test_history_table.value != test_table.value ' + 'OR ( ' + 'test_history_table.value IS NULL ' + 'AND test_table.value IS NOT NULL ' + ') ' + 'OR ( ' + 'test_history_table.value IS NOT NULL ' + 'AND test_table.value IS NULL ' + ') ' + ') ' + 'AND expiration_ts = \'9999-12-31 23:59:59.999999\';\n' + + 'UPDATE test_history_table ' + 'SET expiration_ts = SYSDATE - INTERVAL \'0.000001 seconds\' ' + 'WHERE ( id ) NOT IN ( ' + 'SELECT id ' + 'FROM test_table ' + ') ' + 'AND expiration_ts = \'9999-12-31 23:59:59.999999\';\n' + + 'DELETE FROM test_table_temp ' + 'WHERE (id) IN (' + 'SELECT DISTINCT id ' + 'FROM (' + 'SELECT id, value ' + 'FROM test_history_table ' + 'WHERE expiration_ts = \'9999-12-31 23:59:59.999999\'' + ')' + ');\n' + + 'INSERT INTO test_history_table (' + 'SELECT * FROM (' + 'SELECT SYSDATE, ' + '\'9999-12-31 23:59:59.999999\'::TIMESTAMP, ' + 'id, ' + 'value ' + 'FROM test_table_temp' + ')' + ');\n' + + 'DROP TABLE IF EXISTS test_table_temp CASCADE;') + eq_(self.basic_history_table.update_history_script( + self.basic_table).sql(), sql) From cce2453d3f198a54f54823430713d3a253796c18 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 22 Jan 2015 09:34:09 -0800 Subject: [PATCH 067/175] Code review changes --- dataduct/database/history_table.py | 4 ++-- dataduct/database/table.py | 24 +++++++------------ dataduct/database/tests/test_history_table.py | 5 ++-- 3 files changed, 14 insertions(+), 19 deletions(-) diff --git a/dataduct/database/history_table.py b/dataduct/database/history_table.py index 4d1203f..32589c6 100644 --- a/dataduct/database/history_table.py +++ b/dataduct/database/history_table.py @@ -43,7 +43,7 @@ def _select_current_script(self): return SelectStatement(""" SELECT {selected_columns} FROM {history_name} - WHERE 
{expiration_column} = \'{expiration_max}\' + WHERE {expiration_column} = '{expiration_max}' """.format(selected_columns=', '.join(selected_columns), history_name=self.full_name, expiration_column=HIST_EXPIRATION_COLUMN, @@ -189,7 +189,7 @@ def update_history_script(self, source): # Insert the remaining rows into destination select_statement = SelectStatement(""" - SELECT SYSDATE, \'{expiration_max}\'::TIMESTAMP, {columns} + SELECT SYSDATE, '{expiration_max}'::TIMESTAMP, {columns} FROM {temp_table_name} """.format(expiration_max=HIST_EXPIRATION_MAX, columns=', '.join( diff --git a/dataduct/database/table.py b/dataduct/database/table.py index aeb28a2..17321cd 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -127,21 +127,15 @@ def temporary_clone_script(self): table_name = self.table_name + '_temp' # Create a list of column definitions - # We need to keep primary key constraints on the temp table - column_template = '{column_name} {column_type} {primary_text}' - columns = [] - for column in self.columns(): - primary_text = '' - if column.primary: - primary_text = 'PRIMARY KEY' - columns.append( - column_template.format(column_name=column.column_name, - column_type=column.column_type, - primary_text=primary_text)) - - columns = comma_seperated(columns) - - sql = ['CREATE TEMPORARY TABLE %s ( %s )' % (table_name, columns)] + columns = comma_seperated( + ['%s %s' % (c.column_name, c.column_type) for c in self.columns()]) + + sql = """CREATE TEMPORARY TABLE {table_name} ( + {columns}, + PRIMARY KEY( {primary_keys} ) + )""".format(table_name=table_name, + columns=columns, + primary_keys=comma_seperated(self.primary_key_names)) return SqlScript(sql) diff --git a/dataduct/database/tests/test_history_table.py b/dataduct/database/tests/test_history_table.py index b1d0fa8..d524390 100644 --- a/dataduct/database/tests/test_history_table.py +++ b/dataduct/database/tests/test_history_table.py @@ -49,8 +49,9 @@ def test_history_script(self): """ sql = ( 'CREATE TEMPORARY TABLE test_table_temp ( ' - 'id INTEGER PRIMARY KEY,' - 'value VARCHAR(25) ' + 'id INTEGER,' + 'value VARCHAR(25), ' + 'PRIMARY KEY( id ) ' ');\n' 'INSERT INTO test_table_temp (SELECT * FROM test_table);\n' From f5931b5f8818ce0208d378b9482af022c56949ee Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 22 Jan 2015 10:46:05 -0800 Subject: [PATCH 068/175] Changed unit tests as per code review --- dataduct/database/tests/test_database.py | 51 +++++++++++-------- dataduct/database/tests/test_history_table.py | 34 ++++++++----- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/dataduct/database/tests/test_database.py b/dataduct/database/tests/test_database.py index 2fad6c1..0534247 100644 --- a/dataduct/database/tests/test_database.py +++ b/dataduct/database/tests/test_database.py @@ -182,35 +182,42 @@ def test_database_sorted_relations_cyclic(self): self.second_table_dependent]) database.sorted_relations() - def _test_database_scripts(self, function_name, expected_sql, **kwargs): - """Generate SQL scripts with a preset database + @staticmethod + def _compare_scripts(actual_script, expected_script): + """Validates a SqlScript chain """ - func = getattr(self.script_database, function_name) - eq_(func(**kwargs).sql(), expected_sql) + assert(len(actual_script), len(expected_script)) + for actual, expected in zip(actual_script, expected_script): + eq_(actual.sql(), expected) def test_database_create_relations_script(self): """Creating relations in the database """ - - result = ('CREATE TABLE test_table ( 
id INTEGER );\n' - 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') - self._test_database_scripts('create_relations_script', result) + result = ['CREATE TABLE test_table ( id INTEGER )', + 'CREATE VIEW test_view AS ( SELECT id FROM test_table )'] + self._compare_scripts( + self.script_database.create_relations_script(), + result) def test_database_drop_relations_script(self): """Dropping relations in the database """ - result = ('DROP TABLE IF EXISTS test_table CASCADE;\n' - 'DROP VIEW IF EXISTS test_view CASCADE;') - self._test_database_scripts('drop_relations_script', result) + result = ['DROP TABLE IF EXISTS test_table CASCADE', + 'DROP VIEW IF EXISTS test_view CASCADE'] + self._compare_scripts( + self.script_database.drop_relations_script(), + result) def test_database_recreate_relations_script(self): """Recreating relations in the database """ - result = ('DROP TABLE IF EXISTS test_table CASCADE;\n' - 'CREATE TABLE test_table ( id INTEGER );\n' - 'DROP VIEW IF EXISTS test_view CASCADE;\n' - 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') - self._test_database_scripts('recreate_relations_script', result) + result = ['DROP TABLE IF EXISTS test_table CASCADE', + 'CREATE TABLE test_table ( id INTEGER )', + 'DROP VIEW IF EXISTS test_view CASCADE', + 'CREATE VIEW test_view AS ( SELECT id FROM test_table )'] + self._compare_scripts( + self.script_database.recreate_relations_script(), + result) def test_database_recreate_table_dependencies(self): """Recreating table dependencies @@ -222,9 +229,11 @@ def test_database_recreate_table_dependencies(self): database = Database(relations=[self.first_table_dependent, self.second_table, view]) - result = ('ALTER TABLE first_table ADD FOREIGN KEY (id2) ' - 'REFERENCES second_table (id2);\n' - 'DROP VIEW IF EXISTS view CASCADE;\n' - 'CREATE VIEW view AS ( SELECT id1 FROM second_table );') - eq_(database.recreate_table_dependencies('second_table').sql(), result) + result = ['ALTER TABLE first_table ADD FOREIGN KEY (id2) ' + 'REFERENCES second_table (id2)', + 'DROP VIEW IF EXISTS view CASCADE', + 'CREATE VIEW view AS ( SELECT id1 FROM second_table )'] + self._compare_scripts( + database.recreate_table_dependencies('second_table'), + result) eq_(database.recreate_table_dependencies('first_table').sql(), ';') diff --git a/dataduct/database/tests/test_history_table.py b/dataduct/database/tests/test_history_table.py index d524390..9a1c6be 100644 --- a/dataduct/database/tests/test_history_table.py +++ b/dataduct/database/tests/test_history_table.py @@ -47,14 +47,16 @@ def test_create_history_table_no_timestamps(self): def test_history_script(self): """Diff comparison of generated SQL script """ - sql = ( + expected_script = [ + # Create temp table 'CREATE TEMPORARY TABLE test_table_temp ( ' 'id INTEGER,' 'value VARCHAR(25), ' 'PRIMARY KEY( id ) ' - ');\n' - 'INSERT INTO test_table_temp (SELECT * FROM test_table);\n' - + ')', + # Update temp table with source table's entries + 'INSERT INTO test_table_temp (SELECT * FROM test_table)', + # Expire updated rows 'UPDATE test_history_table ' 'SET expiration_ts = SYSDATE - INTERVAL \'0.000001 seconds\' ' 'FROM test_table ' @@ -70,16 +72,16 @@ def test_history_script(self): 'AND test_table.value IS NULL ' ') ' ') ' - 'AND expiration_ts = \'9999-12-31 23:59:59.999999\';\n' - + 'AND expiration_ts = \'9999-12-31 23:59:59.999999\'', + # Expire deleted rows 'UPDATE test_history_table ' 'SET expiration_ts = SYSDATE - INTERVAL \'0.000001 seconds\' ' 'WHERE ( id ) NOT IN ( ' 'SELECT id ' 'FROM test_table ' 
') ' - 'AND expiration_ts = \'9999-12-31 23:59:59.999999\';\n' - + 'AND expiration_ts = \'9999-12-31 23:59:59.999999\'', + # Delete updated rows from temp table 'DELETE FROM test_table_temp ' 'WHERE (id) IN (' 'SELECT DISTINCT id ' @@ -88,8 +90,8 @@ def test_history_script(self): 'FROM test_history_table ' 'WHERE expiration_ts = \'9999-12-31 23:59:59.999999\'' ')' - ');\n' - + ')', + # Copy temp table rows into source table 'INSERT INTO test_history_table (' 'SELECT * FROM (' 'SELECT SYSDATE, ' @@ -98,8 +100,12 @@ def test_history_script(self): 'value ' 'FROM test_table_temp' ')' - ');\n' + ')', + # Drop temp table + 'DROP TABLE IF EXISTS test_table_temp CASCADE'] - 'DROP TABLE IF EXISTS test_table_temp CASCADE;') - eq_(self.basic_history_table.update_history_script( - self.basic_table).sql(), sql) + actual_script = self.basic_history_table.update_history_script( + self.basic_table) + eq_(len(actual_script), len(expected_script)) + for actual, expected in zip(actual_script, expected_script): + eq_(actual.sql(), expected) From 932653ef49eb48096f45214c5c59b1c3bbeee639 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 22 Jan 2015 14:45:10 -0800 Subject: [PATCH 069/175] Add Column Check step --- dataduct/etl/etl_pipeline.py | 4 ++ dataduct/steps/__init__.py | 1 + dataduct/steps/column_check.py | 42 ++++++++++++++ dataduct/steps/scripts/column_check_test.py | 62 +++++++++++++++++++++ dataduct/steps/scripts/primary_key_test.py | 3 +- dataduct/utils/constants.py | 2 + examples/example_column_check.yaml | 10 ++++ examples/tables/dev.test_table2.sql | 4 ++ 8 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 dataduct/steps/column_check.py create mode 100644 dataduct/steps/scripts/column_check_test.py create mode 100644 examples/example_column_check.yaml create mode 100644 examples/tables/dev.test_table2.sql diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 2c2b066..8c52d9d 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -30,6 +30,7 @@ from ..steps import TransformStep from ..steps import QATransformStep from ..steps import PrimaryKeyCheckStep +from ..steps import ColumnCheckStep from ..s3 import S3File from ..s3 import S3Path @@ -436,6 +437,9 @@ def parse_step_args(self, step_type, **kwargs): elif step_type == 'primary-key-check': step_class = PrimaryKeyCheckStep + elif step_type == 'column-check': + step_class = ColumnCheckStep + elif step_type == 'extract-local': step_class = ExtractLocalStep diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index 3626b83..1b7d77d 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -11,3 +11,4 @@ from .transform import TransformStep from .qa_transform import QATransformStep from .primary_key_check import PrimaryKeyCheckStep +from .column_check import ColumnCheckStep diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py new file mode 100644 index 0000000..3cdc936 --- /dev/null +++ b/dataduct/steps/column_check.py @@ -0,0 +1,42 @@ +"""ETL step wrapper for column check step can be executed on Ec2 resource +""" +import os + +from .qa_transform import QATransformStep +from ..config import Config +from ..utils import constants as const +from ..utils.helpers import parse_path + +config = Config() + + +class ColumnCheckStep(QATransformStep): + """ColumnCheckStep class that checks if the rows of a table has been + populated with the correct values + """ + + def __init__(self, id, source_table_definition, + 
destination_table_definition, **kwargs): + """Constructor for the ColumnCheckStep class + + Args: + source_table_definition(file): + table definition for the source table + destination_table_definition(file): + table definition for the destination table + **kwargs(optional): Keyword arguments directly passed to base class + """ + with open(parse_path(source_table_definition)) as f: + source_table_string = f.read() + with open(parse_path(destination_table_definition)) as f: + destination_table_string = f.read() + + script_arguments = ['--source_table=%s' % source_table_string, + '--destination_table=%s' + % destination_table_string] + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.COLUMN_CHECK_SCRIPT_PATH) + + super(ColumnCheckStep, self).__init__( + id=id, script=script, script_arguments=script_arguments, **kwargs) diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py new file mode 100644 index 0000000..d76e9a6 --- /dev/null +++ b/dataduct/steps/scripts/column_check_test.py @@ -0,0 +1,62 @@ +#!/usr/bin/env python + +"""Script that checks if the rows of the destination table has been populated +with the correct values +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.database import SqlScript +from dataduct.database import Table +from dataduct.qa import ColumnCheck + + +def _get_data(sql, connection): + """Gets the DataFrame containing all the rows of the table + The DataFrame will be indexed by the table's primary key(s) + + Args: + sql(str): The table definition representing the table to query + connection(Connection): A connection to the database + + Returns: + DataFrame: The rows of the table + """ + table = Table(SqlScript(sql)) + return pdsql.read_sql(table.select_script().sql(), + connection, + index_col=table.primary_key_names) + + +def main(): + """Main function + + Args (taken in through argparse): + source_table: SQL script of the source table + destination_table: SQL script of the destination table + """ + parser = argparse.ArgumentParser() + + parser.add_argument('--source_table', dest='source_table', required=True) + parser.add_argument('--destination_table', dest='destination_table', + required=True) + parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) + parser.add_argument('--test_name', dest='test_name', + default='Check Column') + + args = parser.parse_args() + + # Open up a connection and read the source and destination tables + connection = redshift_connection() + source_data = _get_data(args.source_table, connection) + destination_data = _get_data(args.destination_table, connection) + + check = ColumnCheck(source_data, destination_data, name=args.test_name, + sns_topic_arn=args.sns_topic_arn) + check.publish() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/scripts/primary_key_test.py b/dataduct/steps/scripts/primary_key_test.py index 0d9b8d2..cb36d67 100644 --- a/dataduct/steps/scripts/primary_key_test.py +++ b/dataduct/steps/scripts/primary_key_test.py @@ -18,7 +18,8 @@ def main(): parser.add_argument('--table', dest='table', required=True) parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) - parser.add_argument('--test_name', dest='test_name', default="Check Column") + parser.add_argument('--test_name', dest='test_name', + default="Check Primary Key") args = parser.parse_args() diff --git 
a/dataduct/utils/constants.py b/dataduct/utils/constants.py index aa3b15b..dac8218 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -26,3 +26,5 @@ DEPENDENCY_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'pipeline_dependency_check.py') PK_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'primary_key_test.py') +COLUMN_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, + 'column_check_test.py') diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml new file mode 100644 index 0000000..f57f33c --- /dev/null +++ b/examples/example_column_check.yaml @@ -0,0 +1,10 @@ +name : example_column_check +frequency : one-time +load_time: 01:00 + +description : Example for the column-check step + +steps: +- step_type: column-check + source_table_definition: examples/tables/dev.test_table.sql + destination_table_definition: examples/tables/dev.test_table2.sql diff --git a/examples/tables/dev.test_table2.sql b/examples/tables/dev.test_table2.sql new file mode 100644 index 0000000..37fe006 --- /dev/null +++ b/examples/tables/dev.test_table2.sql @@ -0,0 +1,4 @@ +CREATE TABLE dev.test_table2( + id INTEGER PRIMARY KEY, + description VARCHAR(255) +); From 6cf90bc93b3ed94247b85f28e2b19507281bdf6a Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 22 Jan 2015 17:56:03 -0800 Subject: [PATCH 070/175] Add Count Check step --- dataduct/etl/etl_pipeline.py | 4 ++ dataduct/steps/__init__.py | 1 + dataduct/steps/count_check.py | 39 +++++++++++++++++ dataduct/steps/scripts/count_check_test.py | 49 ++++++++++++++++++++++ dataduct/utils/constants.py | 2 + examples/example_count_check.yaml | 10 +++++ examples/tables/dev.test_table2.sql | 4 ++ 7 files changed, 109 insertions(+) create mode 100644 dataduct/steps/count_check.py create mode 100644 dataduct/steps/scripts/count_check_test.py create mode 100644 examples/example_count_check.yaml create mode 100644 examples/tables/dev.test_table2.sql diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 2c2b066..c14c775 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -30,6 +30,7 @@ from ..steps import TransformStep from ..steps import QATransformStep from ..steps import PrimaryKeyCheckStep +from ..steps import CountCheckStep from ..s3 import S3File from ..s3 import S3Path @@ -436,6 +437,9 @@ def parse_step_args(self, step_type, **kwargs): elif step_type == 'primary-key-check': step_class = PrimaryKeyCheckStep + elif step_type == 'count-check': + step_class = CountCheckStep + elif step_type == 'extract-local': step_class = ExtractLocalStep diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index 3626b83..9ffaaaa 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -11,3 +11,4 @@ from .transform import TransformStep from .qa_transform import QATransformStep from .primary_key_check import PrimaryKeyCheckStep +from .count_check import CountCheckStep diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py new file mode 100644 index 0000000..812d20b --- /dev/null +++ b/dataduct/steps/count_check.py @@ -0,0 +1,39 @@ +"""ETL step wrapper for count check step can be executed on the Ec2 resource +""" +import os + +from .qa_transform import QATransformStep +from ..config import Config +from ..utils import constants as const +from ..utils.helpers import parse_path + +config = Config() + + +class CountCheckStep(QATransformStep): + """CountCheckStep class that compares the number of rows in the source + select script 
with the number of rows in the destination table + """ + + def __init__(self, id, source_script, destination_table_definition, + **kwargs): + """Constructor for the CountCheckStep class + + Args: + source_script(str): SQL select script from the source table + destination_table_definition(file): + table definition for the destination table + **kwargs(optional): Keyword arguments directly passed to base class + """ + with open(parse_path(destination_table_definition)) as f: + destination_table_string = f.read() + + script_arguments = ['--source_script=%s' % source_script, + '--destination_table=%s' % + destination_table_string] + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.COUNT_CHECK_SCRIPT_PATH) + + super(CountCheckStep, self).__init__( + id=id, script=script, script_arguments=script_arguments, **kwargs) diff --git a/dataduct/steps/scripts/count_check_test.py b/dataduct/steps/scripts/count_check_test.py new file mode 100644 index 0000000..81bfd54 --- /dev/null +++ b/dataduct/steps/scripts/count_check_test.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python + +"""Script that compares the number of rows in the source select script with the +number of rows in the destination table +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.database import SqlScript +from dataduct.database import Table +from dataduct.qa import CountCheck + + +def main(): + """Main function + + Args (taken in through argparse): + source_script: SQL script used in the pipeline + destination_table: SQL script of the destination table + """ + + parser = argparse.ArgumentParser() + + parser.add_argument('--source_script', dest='source_script', + required=True) + parser.add_argument('--destination_table', dest='destination_table', + required=True) + parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) + parser.add_argument('--test_name', dest='test_name', + default='Check Count') + + args = parser.parse_args() + + connection = redshift_connection() + source_count = len(pdsql.read_sql(args.source_script, connection)) + destination_table = Table(SqlScript(args.destination_table)) + destination_count = len(pdsql.read_sql( + destination_table.select_script().sql(), + connection)) + + check = CountCheck(source_count, destination_count, name=args.test_name, + sns_topic_arn=args.sns_topic_arn) + check.publish() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index aa3b15b..9fe69e9 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -26,3 +26,5 @@ DEPENDENCY_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'pipeline_dependency_check.py') PK_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'primary_key_test.py') +COUNT_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, + 'count_check_test.py') diff --git a/examples/example_count_check.yaml b/examples/example_count_check.yaml new file mode 100644 index 0000000..4fe78d9 --- /dev/null +++ b/examples/example_count_check.yaml @@ -0,0 +1,10 @@ +name : example_count_check +frequency : one-time +load_time : 01:00 + +description : Example for the count-check step + +steps: +- step_type: count-check + source_script: "SELECT * FROM dev.test_table;" + destination_table_definition: examples/tables/dev.test_table2.sql diff --git a/examples/tables/dev.test_table2.sql b/examples/tables/dev.test_table2.sql new file mode 100644 index 0000000..37fe006 
--- /dev/null +++ b/examples/tables/dev.test_table2.sql @@ -0,0 +1,4 @@ +CREATE TABLE dev.test_table2( + id INTEGER PRIMARY KEY, + description VARCHAR(255) +); From dc809f30dc24d4fa452c6e29c14aed9dd3d050e2 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Fri, 23 Jan 2015 11:51:27 -0800 Subject: [PATCH 071/175] Travis Integration --- .travis.yml | 37 +++++++++++++++++++++++++++++++++++-- README.rst | 11 +++++++++-- dataduct/qa/check.py | 11 +++++------ 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 779bcb9..be75c9d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,44 @@ language: python +python: + - 2.7 # command to install dependencies install: - sudo apt-get install graphviz + - pip install coveralls - pip install -r requirements.txt -# command to run tests -script: nosetests +# Setup config file +before_script: + - mkdir ~/.dataduct + - |+ + echo " + etl: + ROLE: DataPipelineDefaultRole + RESOURCE_ROLE: DataPipelineDefaultResourceRole + S3_ETL_BUCKET: FILL_ME_IN + + ec2: + CORE_INSTANCE_TYPE: m1.large + + emr: + CLUSTER_AMI: 2.4.7 + + redshift: + DATABASE_NAME: FILL_ME_IN + CLUSTER_ID: FILL_ME_IN + USERNAME: FILL_ME_IN + PASSWORD: FILL_ME_IN + + mysql: + DATABASE_KEY: + HOST: FILL_ME_IN + USERNAME: FILL_ME_IN + PASSWORD: FILL_ME_IN" > ~/.dataduct/dataduct.cfg + +# Run tests +script: nosetests --with-coverage --cover-package=. --cover-erase +after_success: + coveralls # TODO: Setup config file so that we can actually run this diff --git a/README.rst b/README.rst index bea208e..8c76530 100644 --- a/README.rst +++ b/README.rst @@ -1,5 +1,5 @@ -Dataduct ----------- +Dataduct |build-status| |coverage-status| +----------------------------------------- Dataduct is a wrapper built on top of AWS Datapipeline which makes it easy to create ETL jobs. All jobs can be specified as a series of steps in a YAML file and would automatically be translated into datapipeline with appropriate @@ -24,3 +24,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. + +.. |build-status| + image:: https://travis-ci.org/coursera/dataduct.svg?branch=develop + :target: https://travis-ci.org/coursera/dataduct +.. 
|coverage-status| + image:: https://coveralls.io/repos/coursera/dataduct/badge.svg?branch=develop + :target: https://coveralls.io/r/coursera/dataduct?branch=develop diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index 7a0cfaf..73dd01d 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -5,14 +5,10 @@ from .utils import render_output -config = Config() -SNS_TOPIC_ARN_WARNING = config.etl['SNS_TOPIC_ARN_WARNING'] - - class Check(object): """Base class for QA steps that provides template function for publishing """ - def __init__(self, name, tolerance=0, sns_topic_arn=SNS_TOPIC_ARN_WARNING): + def __init__(self, name, tolerance=0, sns_topic_arn=None): """Constructor for Check class Args: @@ -21,9 +17,12 @@ def __init__(self, name, tolerance=0, sns_topic_arn=SNS_TOPIC_ARN_WARNING): sns_topic_arn(str): sns topic arn for QA test """ self.name = name - self.sns_topic_arn = sns_topic_arn self.tolerance = tolerance self.alert_func = self.get_sns_alert_function() + if sns_topic_arn is None: + config = Config() + sns_topic_arn = config.etl['SNS_TOPIC_ARN_WARNING'] + self.sns_topic_arn = sns_topic_arn def get_sns_alert_function(self): """Get a lamdda function for SNS alert publishing From 7822ea4dd15a5e91c55294946dc02f0b8bfc92d8 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Fri, 23 Jan 2015 17:42:53 -0800 Subject: [PATCH 072/175] Removed todo line --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index be75c9d..13e79b7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -40,5 +40,3 @@ before_script: script: nosetests --with-coverage --cover-package=. --cover-erase after_success: coveralls - -# TODO: Setup config file so that we can actually run this From 3a66ead25265cef077d7f301f44d6b6297e393c1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 24 Jan 2015 02:48:04 -0800 Subject: [PATCH 073/175] Update README.rst --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 8c76530..9a5fcb8 100644 --- a/README.rst +++ b/README.rst @@ -28,6 +28,7 @@ limitations under the License. .. |build-status| image:: https://travis-ci.org/coursera/dataduct.svg?branch=develop :target: https://travis-ci.org/coursera/dataduct + .. 
|coverage-status| image:: https://coveralls.io/repos/coursera/dataduct/badge.svg?branch=develop :target: https://coveralls.io/r/coursera/dataduct?branch=develop From 820004c1b03c10ce4bf7ff3a11b52e7a43d17de3 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 29 Jan 2015 00:33:35 -0800 Subject: [PATCH 074/175] Directory path --- dataduct/database/relation.py | 5 +++++ dataduct/pipeline/s3_node.py | 3 ++- dataduct/steps/primary_key_check.py | 4 +++- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py index f8066b6..7f5f310 100644 --- a/dataduct/database/relation.py +++ b/dataduct/database/relation.py @@ -15,6 +15,11 @@ def __str__(self): """ return self.sql_statement + def sql(self): + """SqlStatement for the table object + """ + return self.sql_statement + def copy(self): """Create a copy of the relation object """ diff --git a/dataduct/pipeline/s3_node.py b/dataduct/pipeline/s3_node.py index ad48a57..cbcad98 100644 --- a/dataduct/pipeline/s3_node.py +++ b/dataduct/pipeline/s3_node.py @@ -53,7 +53,8 @@ def __init__(self, raise ETLInputError('Mismatched type for S3 path') additional_args = {} - if isinstance(s3_object, S3Path) and s3_object.is_directory: + if (isinstance(s3_object, S3Path) and s3_object.is_directory) or \ + (isinstance(s3_object, S3Directory)): additional_args['directoryPath'] = s3_object else: additional_args['filePath'] = s3_object diff --git a/dataduct/steps/primary_key_check.py b/dataduct/steps/primary_key_check.py index 28e631d..07ec766 100644 --- a/dataduct/steps/primary_key_check.py +++ b/dataduct/steps/primary_key_check.py @@ -4,6 +4,7 @@ import os from .qa_transform import QATransformStep +from ..database import Table from ..config import Config from ..utils import constants as const from ..utils.helpers import parse_path @@ -25,7 +26,8 @@ def __init__(self, id, table_definition, **kwargs): with open(parse_path(table_definition)) as f: table_def_string = f.read() - script_arguments = ['--table=%s' % table_def_string] + # We initisialize the table object to check valid strings + script_arguments = ['--table=%s' % Table(table_def_string).sql()] steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.PK_CHECK_SCRIPT_PATH) From 8c1f3a4cf161a87c812ab8f5eb6521cbf9d8e38b Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Feb 2015 12:02:38 -0800 Subject: [PATCH 075/175] S3 directory fix in extract-s3 step --- dataduct/steps/extract_s3.py | 17 ++++++++++++++--- dataduct/steps/primary_key_check.py | 2 +- docs/creating_an_etl.rst | 2 +- examples/example_extract_s3.yaml | 2 +- 4 files changed, 17 insertions(+), 6 deletions(-) diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index bfe91cf..de7850c 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -3,21 +3,32 @@ """ from .etl_step import ETLStep from ..s3 import S3Path +from ..utils.helpers import exactly_one +from ..utils.exceptions import ETLInputError class ExtractS3Step(ETLStep): """ExtractS3 Step class that helps get data from S3 """ - def __init__(self, uri, **kwargs): + def __init__(self, directory_uri=None, file_uri=None, **kwargs): """Constructor for the ExtractS3Step class Args: - uri(str): s3 path for s3 data + directory_uri(str): s3 path for s3 data directory + file_uri(str): s3 path for s3 data file **kwargs(optional): Keyword arguments directly passed to base class """ + if not exactly_one(directory_uri, file_uri): + raise ETLInputError('One 
of file_uri or directory_uri needed') + super(ExtractS3Step, self).__init__(**kwargs) - self._output = self.create_s3_data_node(S3Path(uri=uri)) + + if directory_uri: + s3_path = S3Path(uri=directory_uri, is_directory=True) + else: + s3_path = S3Path(uri=file_uri) + self._output = self.create_s3_data_node(s3_path) @classmethod def arguments_processor(cls, etl, input_args): diff --git a/dataduct/steps/primary_key_check.py b/dataduct/steps/primary_key_check.py index 07ec766..66c7100 100644 --- a/dataduct/steps/primary_key_check.py +++ b/dataduct/steps/primary_key_check.py @@ -26,7 +26,7 @@ def __init__(self, id, table_definition, **kwargs): with open(parse_path(table_definition)) as f: table_def_string = f.read() - # We initisialize the table object to check valid strings + # We initialize the table object to check valid strings script_arguments = ['--table=%s' % Table(table_def_string).sql()] steps_path = os.path.abspath(os.path.dirname(__file__)) diff --git a/docs/creating_an_etl.rst b/docs/creating_an_etl.rst index 3ef5ece..3e3d753 100644 --- a/docs/creating_an_etl.rst +++ b/docs/creating_an_etl.rst @@ -162,7 +162,7 @@ S3 node. .. code:: yaml - step_type: extract-s3 - uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py + file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py load-redshift ^^^^^^^^^^^^^ diff --git a/examples/example_extract_s3.yaml b/examples/example_extract_s3.yaml index f683976..febaf6f 100644 --- a/examples/example_extract_s3.yaml +++ b/examples/example_extract_s3.yaml @@ -7,4 +7,4 @@ description : | steps: - step_type: extract-s3 - uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py + file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py From bd713b03fb076d240eca92fa4cfe2d3a78e4584c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Feb 2015 12:09:59 -0800 Subject: [PATCH 076/175] fix unit tests for relations to not grant --- dataduct/database/database.py | 5 +++-- dataduct/database/relation.py | 1 + dataduct/database/tests/test_database.py | 12 ++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/dataduct/database/database.py b/dataduct/database/database.py index aa64d78..b25ea46 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -167,7 +167,7 @@ def recreate_relations_script(self, grant_permissions=True): return self.relations_script( 'recreate_script', grant_permissions=grant_permissions) - def recreate_table_dependencies(self, table_name): + def recreate_table_dependencies(self, table_name, grant_permissions=True): """Recreate the dependencies for a particular table from the database """ result = SqlScript() @@ -190,7 +190,8 @@ def recreate_table_dependencies(self, table_name): if isinstance(relation, View): # Recreate view if pointing to table if table_name in relation.dependencies: - result.append(relation.recreate_script()) + result.append(relation.recreate_script( + grant_permissions=grant_permissions)) return result @staticmethod diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py index 7f5f310..c87901c 100644 --- a/dataduct/database/relation.py +++ b/dataduct/database/relation.py @@ -52,6 +52,7 @@ def _grant_sql_builder(self, permission, user=None, group=None): if group is not None: result.append(base + 'GROUP %s' % group) + return result def grant_script(self): """Grant the permissions based on the config diff --git a/dataduct/database/tests/test_database.py b/dataduct/database/tests/test_database.py index 2fad6c1..449010b 100644 --- 
a/dataduct/database/tests/test_database.py +++ b/dataduct/database/tests/test_database.py @@ -194,7 +194,8 @@ def test_database_create_relations_script(self): result = ('CREATE TABLE test_table ( id INTEGER );\n' 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') - self._test_database_scripts('create_relations_script', result) + self._test_database_scripts('create_relations_script', result, + grant_permissions=False) def test_database_drop_relations_script(self): """Dropping relations in the database @@ -210,7 +211,8 @@ def test_database_recreate_relations_script(self): 'CREATE TABLE test_table ( id INTEGER );\n' 'DROP VIEW IF EXISTS test_view CASCADE;\n' 'CREATE VIEW test_view AS ( SELECT id FROM test_table );') - self._test_database_scripts('recreate_relations_script', result) + self._test_database_scripts('recreate_relations_script', result, + grant_permissions=False) def test_database_recreate_table_dependencies(self): """Recreating table dependencies @@ -226,5 +228,7 @@ def test_database_recreate_table_dependencies(self): 'REFERENCES second_table (id2);\n' 'DROP VIEW IF EXISTS view CASCADE;\n' 'CREATE VIEW view AS ( SELECT id1 FROM second_table );') - eq_(database.recreate_table_dependencies('second_table').sql(), result) - eq_(database.recreate_table_dependencies('first_table').sql(), ';') + eq_(database.recreate_table_dependencies('second_table', False).sql(), + result) + eq_(database.recreate_table_dependencies('first_table', False).sql(), + ';') From 8377598083d7804c915e593f86987c37e2a10c2c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 02:15:11 -0800 Subject: [PATCH 077/175] column check --- dataduct/database/tests/test_history_table.py | 2 + dataduct/qa/check.py | 4 +- dataduct/steps/column_check.py | 115 ++++++++++++++++-- dataduct/steps/scripts/column_check_test.py | 91 +++++++++++--- examples/example_column_check.yaml | 7 +- 5 files changed, 188 insertions(+), 31 deletions(-) diff --git a/dataduct/database/tests/test_history_table.py b/dataduct/database/tests/test_history_table.py index 9a1c6be..7318370 100644 --- a/dataduct/database/tests/test_history_table.py +++ b/dataduct/database/tests/test_history_table.py @@ -15,10 +15,12 @@ class TestHistoryTable(TestCase): @staticmethod def _create_history_table(sql): + """Helper function""" return HistoryTable(SqlScript(sql)) @staticmethod def _create_table(sql): + """Helper function""" return Table(SqlScript(sql)) def setUp(self): diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index 73dd01d..a422c92 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -18,11 +18,11 @@ def __init__(self, name, tolerance=0, sns_topic_arn=None): """ self.name = name self.tolerance = tolerance - self.alert_func = self.get_sns_alert_function() if sns_topic_arn is None: config = Config() - sns_topic_arn = config.etl['SNS_TOPIC_ARN_WARNING'] + sns_topic_arn = config.etl.get('SNS_TOPIC_ARN_WARNING', None) self.sns_topic_arn = sns_topic_arn + self.alert_func = self.get_sns_alert_function() def get_sns_alert_function(self): """Get a lamdda function for SNS alert publishing diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index 3cdc936..fb7d250 100644 --- a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -4,10 +4,16 @@ from .qa_transform import QATransformStep from ..config import Config +from ..database import SqlScript +from ..database import Table +from ..database import SelectStatement from ..utils import constants as const from ..utils.helpers import 
parse_path +from ..utils.helpers import exactly_one +from ..utils.exceptions import ETLInputError config = Config() +COLUMN_TEMPLATE = "COALESCE(CONCAT({column_name}, ''), '')" class ColumnCheckStep(QATransformStep): @@ -15,28 +21,115 @@ class ColumnCheckStep(QATransformStep): populated with the correct values """ - def __init__(self, id, source_table_definition, - destination_table_definition, **kwargs): + def __init__(self, id, source_sql, source_host, + destination_table_definition=None, + destination_sql=None, sql_tail_for_source=None, + sample_size=100, tolerance=0.01, script_arguments=None, + **kwargs): """Constructor for the ColumnCheckStep class Args: - source_table_definition(file): - table definition for the source table destination_table_definition(file): table definition for the destination table **kwargs(optional): Keyword arguments directly passed to base class """ - with open(parse_path(source_table_definition)) as f: - source_table_string = f.read() - with open(parse_path(destination_table_definition)) as f: - destination_table_string = f.read() - script_arguments = ['--source_table=%s' % source_table_string, - '--destination_table=%s' - % destination_table_string] + if not exactly_one(destination_table_definition, destination_sql): + raise ETLInputError('One of dest table or dest sql needed') + + if script_arguments is None: + script_arguments = list() + + if sql_tail_for_source is None: + sql_tail_for_source = '' + + # Get the EDW column SQL + dest_sql, primary_key_index = self.convert_destination_to_column_sql( + destination_table_definition, destination_sql) + + src_sql = self.convert_source_to_column_sql(source_sql, + primary_key_index, + sql_tail_for_source) + + script_arguments.extend([ + '--sample_size=%s' % str(sample_size), + '--tolerance=%s' % str(tolerance), + '--destination_sql=%s' % dest_sql, + '--source_sql=%s' % src_sql, + '--source_host=%s' % source_host + ]) steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.COLUMN_CHECK_SCRIPT_PATH) super(ColumnCheckStep, self).__init__( id=id, script=script, script_arguments=script_arguments, **kwargs) + + @staticmethod + def convert_destination_to_column_sql(destination_table_definition=None, + destination_sql=None): + """Convert the destination query into generic structure to compare + """ + if destination_table_definition is not None: + with open(parse_path(destination_table_definition)) as f: + destination_table_string = f.read() + + destination_table = Table(SqlScript(destination_table_string)) + destination_columns = destination_table.columns() + primary_key_index, primary_keys = zip(*[ + (idx, col.name) + for idx, col in enumerate(destination_columns.columns()) + if col.primary]) + + if len(destination_columns) == len(primary_key_index): + raise ValueError('Cannot check table without non-pk columns') + + column_string = '||'.join( + [COLUMN_TEMPLATE.format(column_name=c.name) + for c in destination_columns if not c.primary]) + concatenated_column = '( {columns} )'.format(columns=column_string) + + destination_sql = '''SELECT {primary_keys}, {concat_column} + FROM {table_name} + WHERE ({primary_keys}) IN PRIMARY_KEY_SET + '''.format(primary_keys=','.join(primary_keys), + concat_column=concatenated_column, + table_name=destination_table.full_name) + + elif destination_sql is not None: + select_stmnt = SelectStatement(destination_sql) + primary_key_index = range(len(select_stmnt.columns()))[:-1] + + return SqlScript(destination_sql).sql(), primary_key_index + + 
@staticmethod + def convert_source_to_column_sql(source_sql, primary_key_index, + sql_tail_for_source): + """Convert the source query into generic structure to compare + """ + origin_sql = SelectStatement(SqlScript(source_sql).statements[0].sql()) + column_names = [x.name for x in origin_sql.columns()] + + non_primary_key_index = [idx for idx in range(len(column_names)) + if idx not in primary_key_index] + + primary_key_str = ','.join( + [column_names[idx] for idx in primary_key_index]) + + if len(column_names) == len(primary_key_index): + raise ValueError('Cannot check column on table with no pk columns') + + column_string = ','.join( + [COLUMN_TEMPLATE.format(column_name=column_names[idx]) + for idx in non_primary_key_index]) + concatenated_column = ('CONCAT(%s)' % column_string) + + template = '''SELECT {primary_keys}, {concat_column} + FROM ({origin_sql}) AS origin {sql_tail}''' + + query = template.format(primary_keys=primary_key_str, + concat_column=concatenated_column, + origin_sql=origin_sql.sql(), + sql_tail=sql_tail_for_source) + + return SqlScript(query).sql() diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py index d76e9a6..2d75f5e 100644 --- a/dataduct/steps/scripts/column_check_test.py +++ b/dataduct/steps/scripts/column_check_test.py @@ -5,14 +5,16 @@ """ import argparse +import collections +import re import pandas.io.sql as pdsql from dataduct.data_access import redshift_connection +from dataduct.data_access import rds_connection from dataduct.database import SqlScript -from dataduct.database import Table from dataduct.qa import ColumnCheck -def _get_data(sql, connection): +def _get_source_data(sql, hostname, sample_size): """Gets the DataFrame containing all the rows of the table The DataFrame will be indexed by the table's primary key(s) @@ -23,24 +25,79 @@ def _get_data(sql, connection): Returns: DataFrame: The rows of the table """ - table = Table(SqlScript(sql)) - return pdsql.read_sql(table.select_script().sql(), - connection, - index_col=table.primary_key_names) + connection = rds_connection(hostname) + query = re.sub( + r'(?i)LIMIT_PLACEHOLDER', + str(sample_size), + sql, + ) + + data = pdsql.read_sql(query, connection) + connection.close() + # All columns apart from last are PK columns + return data.set_index(list(data.columns[:-1])) + + +def _get_destination_data(sql, primary_keys): + """Gets the DataFrame containing all the rows of the table + The DataFrame will be indexed by the table's primary key(s) + + Args: + sql(str): The table definition representing the table to query + + Returns: + DataFrame: The rows of the table + """ + connection = redshift_connection() + + # Make primary_keys always a list of tuples + if isinstance(primary_keys[0], basestring): + primary_keys = [(pk) for pk in primary_keys] + + # Check whether it is not iterable + if not isinstance(primary_keys, collections.Iterable): + primary_keys = [tuple([pk]) for pk in primary_keys] + + # Format primary key string + primary_key_string = re.sub( + r",\)", + ")", + str(tuple(primary_keys)) + ) + + # If a key is Timestamp, the output string needs to be fixed. 
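+    # (pandas returns datetime index values as Timestamp objects, whose
+    # string form is not valid SQL)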
+ # e.g., from Timestamp('2014-06-09 05:13:11') to '2014-06-09 05:13:11' + primary_key_string = re.sub(r"Timestamp\(([^,]*)[^)]*\)", r"\1", + primary_key_string) + + query = re.sub( + r'(?i)PRIMARY_KEY_SET', + primary_key_string, + sql, + ) + + data = pdsql.read_sql(query, connection) + connection.close() + # All columns apart from last are PK columns + return data.set_index(list(data.columns[:-1])) def main(): """Main function Args (taken in through argparse): - source_table: SQL script of the source table - destination_table: SQL script of the destination table + source_sql: SQL script of the source data + destination_sql: SQL script of the destination data """ parser = argparse.ArgumentParser() - parser.add_argument('--source_table', dest='source_table', required=True) - parser.add_argument('--destination_table', dest='destination_table', + parser.add_argument('--source_sql', dest='source_sql', required=True) + parser.add_argument('--source_host', dest='source_host', required=True) + parser.add_argument('--destination_sql', dest='destination_sql', required=True) + parser.add_argument('--sample_size', dest='sample_size', required=True) + parser.add_argument('--tolerance', type=float, dest='tolerance', + default=0.1) parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default='Check Column') @@ -48,14 +105,16 @@ def main(): args = parser.parse_args() # Open up a connection and read the source and destination tables - connection = redshift_connection() - source_data = _get_data(args.source_table, connection) - destination_data = _get_data(args.destination_table, connection) + source_data = _get_source_data(args.source_sql, args.source_host, + args.sample_size) + destination_data = _get_destination_data(args.destination_sql, + list(source_data.index)) - check = ColumnCheck(source_data, destination_data, name=args.test_name, - sns_topic_arn=args.sns_topic_arn) + check = ColumnCheck(source_data, destination_data, + name=args.test_name, + sns_topic_arn=args.sns_topic_arn, + tolerance=args.tolerance) check.publish() - connection.close() if __name__ == '__main__': diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml index f57f33c..9ff634c 100644 --- a/examples/example_column_check.yaml +++ b/examples/example_column_check.yaml @@ -6,5 +6,8 @@ description : Example for the column-check step steps: - step_type: column-check - source_table_definition: examples/tables/dev.test_table.sql - destination_table_definition: examples/tables/dev.test_table2.sql + source_sql: "SELECT id, name FROM networks_network" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" + sample_size: 10 From d1080c5e3ed78d72037b9a54bfdf5c62eb448f3a Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 02:17:05 -0800 Subject: [PATCH 078/175] remove table --- examples/tables/dev.test_table2.sql | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 examples/tables/dev.test_table2.sql diff --git a/examples/tables/dev.test_table2.sql b/examples/tables/dev.test_table2.sql deleted file mode 100644 index 37fe006..0000000 --- a/examples/tables/dev.test_table2.sql +++ /dev/null @@ -1,4 +0,0 @@ -CREATE TABLE dev.test_table2( - id INTEGER PRIMARY KEY, - description VARCHAR(255) -); From f87f4bfc3ae49a2491fe7d0fd41a310fcea9742d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 03:15:49 -0800 
Subject: [PATCH 079/175] fix the step --- dataduct/steps/column_check.py | 2 +- dataduct/steps/count_check.py | 67 +++++++++++++++++--- dataduct/steps/scripts/column_check_test.py | 3 +- dataduct/steps/scripts/count_check_test.py | 68 +++++++++++++++------ examples/example_column_check.yaml | 2 +- examples/example_count_check.yaml | 6 +- examples/tables/dev.test_table2.sql | 4 -- 7 files changed, 114 insertions(+), 38 deletions(-) delete mode 100644 examples/tables/dev.test_table2.sql diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index fb7d250..99a54fa 100644 --- a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -24,7 +24,7 @@ class ColumnCheckStep(QATransformStep): def __init__(self, id, source_sql, source_host, destination_table_definition=None, destination_sql=None, sql_tail_for_source=None, - sample_size=100, tolerance=0.01, script_arguments=None, + sample_size=100, tolerance=1.0, script_arguments=None, **kwargs): """Constructor for the ColumnCheckStep class diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py index 812d20b..8a243e5 100644 --- a/dataduct/steps/count_check.py +++ b/dataduct/steps/count_check.py @@ -4,8 +4,11 @@ from .qa_transform import QATransformStep from ..config import Config +from ..database import SqlScript +from ..database import SqlStatement from ..utils import constants as const -from ..utils.helpers import parse_path +from ..utils.helpers import exactly_one +from ..utils.exceptions import ETLInputError config = Config() @@ -15,25 +18,69 @@ class CountCheckStep(QATransformStep): select script with the number of rows in the destination table """ - def __init__(self, id, source_script, destination_table_definition, + def __init__(self, id, source_host, source_sql=None, source_table_name=None, + destination_table_name=None, destination_sql=None, + tolerance=1.0, script_arguments=None, **kwargs): """Constructor for the CountCheckStep class Args: - source_script(str): SQL select script from the source table - destination_table_definition(file): - table definition for the destination table + source_host(str): host alias of the source database connection + source_sql(str): SQL select script from the source table + source_table_name(str): table name for the source table + destination_table_name(str): table name for the destination table + destination_sql(str): SQL select script from the destination table + tolerance(float): allowed percentage difference between the counts **kwargs(optional): Keyword arguments directly passed to base class """ - with open(parse_path(destination_table_definition)) as f: - destination_table_string = f.read() - script_arguments = ['--source_script=%s' % source_script, - '--destination_table=%s' % - destination_table_string] + if not exactly_one(destination_table_name, destination_sql): + raise ETLInputError('One of dest table or dest sql needed') + + if not exactly_one(source_sql, source_table_name): + raise ETLInputError('One of source table or source sql needed') + + if script_arguments is None: + script_arguments = list() + + # Build the destination and source count SQL + dest_sql = self.convert_destination_to_count_sql( + destination_table_name, destination_sql) + + src_sql = self.convert_source_to_count_sql( + source_table_name, source_sql) + + script_arguments.extend([ + '--tolerance=%s' % str(tolerance), + '--destination_sql=%s' % dest_sql, + '--source_sql=%s' % src_sql, + '--source_host=%s' % source_host + ]) steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.COUNT_CHECK_SCRIPT_PATH) super(CountCheckStep, self).__init__( id=id, script=script, script_arguments=script_arguments, **kwargs) + + @staticmethod + def convert_destination_to_count_sql(destination_table_name=None, + 
destination_sql=None): + """Convert the destination query into generic structure to compare + """ + if destination_table_name is not None: + destination_sql = "SELECT COUNT(1) FROM %s" % destination_table_name + else: + dest_sql = SqlStatement(destination_sql) + destination_sql = "SELECT COUNT(1) FROM (%s)a" % dest_sql.sql() + + return SqlScript(destination_sql).sql() + + @staticmethod + def convert_source_to_count_sql(source_table_name=None, + source_sql=None): + """Convert the source query into generic structure to compare + """ + if source_table_name is not None: + source_sql = "SELECT COUNT(1) FROM %s" % source_table_name + else: + origin_sql = SqlStatement(source_sql) + source_sql = "SELECT COUNT(1) FROM (%s)a" % origin_sql.sql() + + return SqlScript(source_sql).sql() diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py index 2d75f5e..82f80d3 100644 --- a/dataduct/steps/scripts/column_check_test.py +++ b/dataduct/steps/scripts/column_check_test.py @@ -10,7 +10,6 @@ import pandas.io.sql as pdsql from dataduct.data_access import redshift_connection from dataduct.data_access import rds_connection -from dataduct.database import SqlScript from dataduct.qa import ColumnCheck @@ -97,7 +96,7 @@ def main(): required=True) parser.add_argument('--sample_size', dest='sample_size', required=True) parser.add_argument('--tolerance', type=float, dest='tolerance', - default=0.1) + default=1.0) parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default='Check Column') diff --git a/dataduct/steps/scripts/count_check_test.py b/dataduct/steps/scripts/count_check_test.py index 81bfd54..49d2ec8 100644 --- a/dataduct/steps/scripts/count_check_test.py +++ b/dataduct/steps/scripts/count_check_test.py @@ -7,42 +7,74 @@ import argparse import pandas.io.sql as pdsql from dataduct.data_access import redshift_connection -from dataduct.database import SqlScript -from dataduct.database import Table +from dataduct.data_access import rds_connection from dataduct.qa import CountCheck +def _get_source_data(sql, hostname): + """Gets the DataFrame containing all the rows of the table + The DataFrame will be indexed by the table's primary key(s) + + Args: + sql(str): The table definition representing the table to query + connection(Connection): A connection to the database + + Returns: + DataFrame: The rows of the table + """ + connection = rds_connection(hostname) + data = pdsql.read_sql(sql, connection) + connection.close() + return data.iloc[0][0] + + +def _get_destination_data(sql): + """Gets the DataFrame containing all the rows of the table + The DataFrame will be indexed by the table's primary key(s) + + Args: + sql(str): The table definition representing the table to query + connection(Connection): A connection to the database + + Returns: + DataFrame: The rows of the table + """ + connection = redshift_connection() + data = pdsql.read_sql(sql, connection) + connection.close() + # All columns apart from last are PK columns + return data.iloc[0][0] + + def main(): """Main function Args (taken in through argparse): - source_script: SQL script used in the pipeline - destination_table: SQL script of the destination table + source_sql: SQL script of the source data + destination_sql: SQL script of the destination data """ - parser = argparse.ArgumentParser() - parser.add_argument('--source_script', dest='source_script', - required=True) - parser.add_argument('--destination_table', 
dest='destination_table', + parser.add_argument('--source_sql', dest='source_sql', required=True) + parser.add_argument('--source_host', dest='source_host', required=True) + parser.add_argument('--destination_sql', dest='destination_sql', required=True) + parser.add_argument('--tolerance', type=float, dest='tolerance', + default=1.0) parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default='Check Count') args = parser.parse_args() - connection = redshift_connection() - source_count = len(pdsql.read_sql(args.source_script, connection)) - destination_table = Table(SqlScript(args.destination_table)) - destination_count = len(pdsql.read_sql( - destination_table.select_script().sql(), - connection)) - - check = CountCheck(source_count, destination_count, name=args.test_name, - sns_topic_arn=args.sns_topic_arn) + source_count = _get_source_data(args.source_sql, args.source_host) + destination_count = _get_destination_data(args.destination_sql) + + check = CountCheck(source_count, destination_count, + name=args.test_name, + sns_topic_arn=args.sns_topic_arn, + tolerance=args.tolerance) check.publish() - connection.close() if __name__ == '__main__': diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml index 9ff634c..be582c8 100644 --- a/examples/example_column_check.yaml +++ b/examples/example_column_check.yaml @@ -6,7 +6,7 @@ description : Example for the column-check step steps: - step_type: column-check - source_sql: "SELECT id, name FROM networks_network" + source_sql: "SELECT id, NAME FROM networks_network;" source_host: maestro destination_sql: "SELECT network_id, network_name FROM prod.networks" sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" diff --git a/examples/example_count_check.yaml b/examples/example_count_check.yaml index 4fe78d9..5997d0a 100644 --- a/examples/example_count_check.yaml +++ b/examples/example_count_check.yaml @@ -6,5 +6,7 @@ description : Example for the count-check step steps: - step_type: count-check - source_script: "SELECT * FROM dev.test_table;" - destination_table_definition: examples/tables/dev.test_table2.sql + source_sql: "SELECT id, NAME FROM networks_network;" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + tolerance: 2.0 diff --git a/examples/tables/dev.test_table2.sql b/examples/tables/dev.test_table2.sql deleted file mode 100644 index 37fe006..0000000 --- a/examples/tables/dev.test_table2.sql +++ /dev/null @@ -1,4 +0,0 @@ -CREATE TABLE dev.test_table2( - id INTEGER PRIMARY KEY, - description VARCHAR(255) -); From ee2523d296e467f4877d0f20cdd231b250df79cc Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 03:17:14 -0800 Subject: [PATCH 080/175] fix typo --- examples/example_column_check.yaml | 2 +- examples/example_count_check.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml index be582c8..55cc30a 100644 --- a/examples/example_column_check.yaml +++ b/examples/example_column_check.yaml @@ -6,7 +6,7 @@ description : Example for the column-check step steps: - step_type: column-check - source_sql: "SELECT id, NAME FROM networks_network;" + source_sql: "SELECT id, name FROM networks_network;" source_host: maestro destination_sql: "SELECT network_id, network_name FROM prod.networks" sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" diff --git 
a/examples/example_count_check.yaml b/examples/example_count_check.yaml index 5997d0a..5afdbf5 100644 --- a/examples/example_count_check.yaml +++ b/examples/example_count_check.yaml @@ -6,7 +6,7 @@ description : Example for the count-check step steps: - step_type: count-check - source_sql: "SELECT id, NAME FROM networks_network;" + source_sql: "SELECT id, name FROM networks_network;" source_host: maestro destination_sql: "SELECT network_id, network_name FROM prod.networks" tolerance: 2.0 From f77198c602896d3733ec471ad24690614effb3fc Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 15:54:51 -0800 Subject: [PATCH 081/175] Refactor resources --- dataduct/__init__.py | 2 +- dataduct/steps/etl_step.py | 11 ++++++++++- dataduct/steps/primary_key_check.py | 4 +++- examples/example_custom_extract_local.yaml | 2 +- examples/example_double_input.yaml | 6 +++--- examples/example_double_output.yaml | 8 ++++---- examples/example_emr_streaming.yaml | 8 ++++---- examples/example_extract_local.yaml | 2 +- examples/example_load_redshift.yaml | 2 +- examples/example_primary_key_check.yaml | 2 +- examples/example_sql_command.yaml | 2 +- examples/example_transform.yaml | 6 +++--- examples/resources/{ => data}/test_table1.tsv | 0 examples/resources/{ => data}/test_table2.tsv | 0 examples/resources/{ => data}/word_data.txt | 0 examples/{ => resources}/scripts/s3_profiler.py | 0 examples/{ => resources}/scripts/word_mapper.py | 0 examples/{ => resources}/scripts/word_reducer.py | 0 examples/{ => resources}/tables/categories.sql | 0 examples/{ => resources}/tables/customers.sql | 0 examples/{ => resources}/tables/dev.test_table.sql | 0 examples/{ => resources}/tables/employees.sql | 0 examples/{ => resources}/tables/order_details.sql | 0 examples/{ => resources}/tables/orders.sql | 0 examples/{ => resources}/tables/products.sql | 0 examples/{ => resources}/tables/shippers.sql | 0 examples/{ => resources}/tables/suppliers.sql | 0 examples/{scripts => steps}/custom_extract_local.py | 4 +++- setup.py | 7 +++++-- 29 files changed, 41 insertions(+), 25 deletions(-) rename examples/resources/{ => data}/test_table1.tsv (100%) rename examples/resources/{ => data}/test_table2.tsv (100%) rename examples/resources/{ => data}/word_data.txt (100%) rename examples/{ => resources}/scripts/s3_profiler.py (100%) rename examples/{ => resources}/scripts/word_mapper.py (100%) rename examples/{ => resources}/scripts/word_reducer.py (100%) rename examples/{ => resources}/tables/categories.sql (100%) rename examples/{ => resources}/tables/customers.sql (100%) rename examples/{ => resources}/tables/dev.test_table.sql (100%) rename examples/{ => resources}/tables/employees.sql (100%) rename examples/{ => resources}/tables/order_details.sql (100%) rename examples/{ => resources}/tables/orders.sql (100%) rename examples/{ => resources}/tables/products.sql (100%) rename examples/{ => resources}/tables/shippers.sql (100%) rename examples/{ => resources}/tables/suppliers.sql (100%) rename examples/{scripts => steps}/custom_extract_local.py (82%) diff --git a/dataduct/__init__.py b/dataduct/__init__.py index 8bce1d9..dd420db 100644 --- a/dataduct/__init__.py +++ b/dataduct/__init__.py @@ -1,4 +1,4 @@ """Welcome to DataDuct """ -__version__ = '0.1.0' +__version__ = '0.2.0' __import__('pkg_resources').declare_namespace(__name__) diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 8e13815..ed3e252 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -225,12 +225,21 @@ def 
copy_s3(self, input_node, dest_uri): # create s3 node for output output_node = self.create_s3_data_node(dest_uri) + # Create new input node if file and not directory + if input_node.path().is_directory: + new_input_node = input_node + else: + uri = "/".join(input_node.path().uri.split("/")[:-1]) + new_input_node = self.create_s3_data_node( + S3Path(uri=uri, is_directory=True)) + new_input_node.add_dependency_node(input_node) + # create copy activity activity = self.create_pipeline_object( CopyActivity, schedule=self.schedule, resource=self.resource, - input_node=input_node, + input_node=new_input_node, output_node=output_node, max_retries=self.max_retries ) diff --git a/dataduct/steps/primary_key_check.py b/dataduct/steps/primary_key_check.py index 66c7100..d366654 100644 --- a/dataduct/steps/primary_key_check.py +++ b/dataduct/steps/primary_key_check.py @@ -5,6 +5,7 @@ from .qa_transform import QATransformStep from ..database import Table +from ..database import SqlStatement from ..config import Config from ..utils import constants as const from ..utils.helpers import parse_path @@ -27,7 +28,8 @@ def __init__(self, id, table_definition, **kwargs): table_def_string = f.read() # We initialize the table object to check valid strings - script_arguments = ['--table=%s' % Table(table_def_string).sql()] + script_arguments = [ + '--table=%s' % Table(SqlStatement(table_def_string)).sql()] steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.PK_CHECK_SCRIPT_PATH) diff --git a/examples/example_custom_extract_local.yaml b/examples/example_custom_extract_local.yaml index aeb8d54..809df06 100644 --- a/examples/example_custom_extract_local.yaml +++ b/examples/example_custom_extract_local.yaml @@ -7,4 +7,4 @@ description : | steps: - step_type: custom-extract-local - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv diff --git a/examples/example_double_input.yaml b/examples/example_double_input.yaml index e4c8913..3cd3353 100644 --- a/examples/example_double_input.yaml +++ b/examples/example_double_input.yaml @@ -7,14 +7,14 @@ description : Example for the transform step with multiple inputs steps: - step_type: extract-local name: step1 - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv - step_type: extract-local name: step2 - path: examples/resources/test_table2.tsv + path: data/test_table2.tsv - step_type: transform - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py input_node: step1: script step2: directory diff --git a/examples/example_double_output.yaml b/examples/example_double_output.yaml index e55d943..9c06d07 100644 --- a/examples/example_double_output.yaml +++ b/examples/example_double_output.yaml @@ -7,11 +7,11 @@ description : Example for the transform step with multiple outputs steps: - step_type: extract-local name: step1_a - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv - step_type: extract-local name: step1_b - path: examples/resources/test_table2.tsv + path: data/test_table2.tsv - step_type: transform command: cp -r $INPUT1_STAGING_DIR/* $OUTPUT1_STAGING_DIR @@ -24,7 +24,7 @@ steps: - step_type: transform name: profiler_1 - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py input_node: step2_a script_arguments: - --input=INPUT1_STAGING_DIR @@ -33,7 +33,7 @@ steps: - step_type: transform name: profiler_2 - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py input_node: step2_b script_arguments: - 
--input=INPUT1_STAGING_DIR diff --git a/examples/example_emr_streaming.yaml b/examples/example_emr_streaming.yaml index 6d14fdd..1acc0ec 100644 --- a/examples/example_emr_streaming.yaml +++ b/examples/example_emr_streaming.yaml @@ -10,14 +10,14 @@ description : Example for the emr_streaming step steps: - step_type: extract-local - path: examples/resources/word_data.txt + path: data/word_data.txt - step_type: emr-streaming - mapper: examples/scripts/word_mapper.py - reducer: examples/scripts/word_reducer.py + mapper: scripts/word_mapper.py + reducer: scripts/word_reducer.py - step_type: transform - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR diff --git a/examples/example_extract_local.yaml b/examples/example_extract_local.yaml index 005de12..377be7e 100644 --- a/examples/example_extract_local.yaml +++ b/examples/example_extract_local.yaml @@ -7,4 +7,4 @@ description : | steps: - step_type: extract-local - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv diff --git a/examples/example_load_redshift.yaml b/examples/example_load_redshift.yaml index 735a386..1082641 100644 --- a/examples/example_load_redshift.yaml +++ b/examples/example_load_redshift.yaml @@ -6,7 +6,7 @@ description : Example for the load_redshift step steps: - step_type: extract-local - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv - step_type: load-redshift schema: dev diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml index c8c2218..a482e50 100644 --- a/examples/example_primary_key_check.yaml +++ b/examples/example_primary_key_check.yaml @@ -6,4 +6,4 @@ description : Example for the primary-key-check step steps: - step_type: primary-key-check - table_definition: examples/tables/dev.test_table.sql + table_definition: tables/dev.test_table.sql diff --git a/examples/example_sql_command.yaml b/examples/example_sql_command.yaml index e80aa0e..de8c180 100644 --- a/examples/example_sql_command.yaml +++ b/examples/example_sql_command.yaml @@ -6,4 +6,4 @@ description : Example for the sql_command step steps: - step_type: sql-command - command: INSERT INTO dev.test_table VALUES (1, 'hello_etl'); + command: SELECT * FROM dev.test_table; diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml index e82b1ee..a9c3423 100644 --- a/examples/example_transform.yaml +++ b/examples/example_transform.yaml @@ -7,18 +7,18 @@ description : Example for the transform step steps: - step_type: extract-local name: extract-node - path: examples/resources/test_table1.tsv + path: data/test_table1.tsv - step_type: transform input_node: extract-node - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR - step_type: transform input_node: extract-node - script_directory: examples/scripts/ + script_directory: scripts/ script_name: s3_profiler.py script_arguments: - --input=INPUT1_STAGING_DIR diff --git a/examples/resources/test_table1.tsv b/examples/resources/data/test_table1.tsv similarity index 100% rename from examples/resources/test_table1.tsv rename to examples/resources/data/test_table1.tsv diff --git a/examples/resources/test_table2.tsv b/examples/resources/data/test_table2.tsv similarity index 100% rename from examples/resources/test_table2.tsv rename to examples/resources/data/test_table2.tsv diff --git a/examples/resources/word_data.txt 
b/examples/resources/data/word_data.txt similarity index 100% rename from examples/resources/word_data.txt rename to examples/resources/data/word_data.txt diff --git a/examples/scripts/s3_profiler.py b/examples/resources/scripts/s3_profiler.py similarity index 100% rename from examples/scripts/s3_profiler.py rename to examples/resources/scripts/s3_profiler.py diff --git a/examples/scripts/word_mapper.py b/examples/resources/scripts/word_mapper.py similarity index 100% rename from examples/scripts/word_mapper.py rename to examples/resources/scripts/word_mapper.py diff --git a/examples/scripts/word_reducer.py b/examples/resources/scripts/word_reducer.py similarity index 100% rename from examples/scripts/word_reducer.py rename to examples/resources/scripts/word_reducer.py diff --git a/examples/tables/categories.sql b/examples/resources/tables/categories.sql similarity index 100% rename from examples/tables/categories.sql rename to examples/resources/tables/categories.sql diff --git a/examples/tables/customers.sql b/examples/resources/tables/customers.sql similarity index 100% rename from examples/tables/customers.sql rename to examples/resources/tables/customers.sql diff --git a/examples/tables/dev.test_table.sql b/examples/resources/tables/dev.test_table.sql similarity index 100% rename from examples/tables/dev.test_table.sql rename to examples/resources/tables/dev.test_table.sql diff --git a/examples/tables/employees.sql b/examples/resources/tables/employees.sql similarity index 100% rename from examples/tables/employees.sql rename to examples/resources/tables/employees.sql diff --git a/examples/tables/order_details.sql b/examples/resources/tables/order_details.sql similarity index 100% rename from examples/tables/order_details.sql rename to examples/resources/tables/order_details.sql diff --git a/examples/tables/orders.sql b/examples/resources/tables/orders.sql similarity index 100% rename from examples/tables/orders.sql rename to examples/resources/tables/orders.sql diff --git a/examples/tables/products.sql b/examples/resources/tables/products.sql similarity index 100% rename from examples/tables/products.sql rename to examples/resources/tables/products.sql diff --git a/examples/tables/shippers.sql b/examples/resources/tables/shippers.sql similarity index 100% rename from examples/tables/shippers.sql rename to examples/resources/tables/shippers.sql diff --git a/examples/tables/suppliers.sql b/examples/resources/tables/suppliers.sql similarity index 100% rename from examples/tables/suppliers.sql rename to examples/resources/tables/suppliers.sql diff --git a/examples/scripts/custom_extract_local.py b/examples/steps/custom_extract_local.py similarity index 82% rename from examples/scripts/custom_extract_local.py rename to examples/steps/custom_extract_local.py index 4acb4c7..6614af6 100644 --- a/examples/scripts/custom_extract_local.py +++ b/examples/steps/custom_extract_local.py @@ -2,6 +2,8 @@ ETL step wrapper for creating an S3 node for input from local files """ from dataduct.steps import ExtractLocalStep +import logging +logger = logging.getLogger(__name__) class CustomExtractLocalStep(ExtractLocalStep): @@ -14,5 +16,5 @@ def __init__(self, **kwargs): Args: **kwargs(optional): Keyword arguments directly passed to base class """ - print 'Using the Custom Extract Local Step' + logger.info('Using the Custom Extract Local Step') super(CustomExtractLocalStep, self).__init__(**kwargs) diff --git a/setup.py b/setup.py index 9c2017d..138d2c1 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,12 
@@ """ -Setup file for installation of the etllib code +Setup file for installation of the dataduct code """ from setuptools import setup from setuptools import find_packages setup( name='dataduct', - version='0.1.0', + version='0.2.0', author='Coursera Inc.', packages=find_packages( exclude=["*.tests", "*.tests.*", "tests.*", "tests"]), @@ -23,6 +23,9 @@ 'pandas', 'psycopg2', 'MySQL-python', + 'pyparsing', + 'testfixtures', + 'sphinx_rtd_theme' ], scripts=['bin/dataduct'], classifiers=[ From 22cc9d6a8051853c2ace42aafdb887bce43b0cc3 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 21:39:48 -0800 Subject: [PATCH 082/175] check and load step --- dataduct/config/__init__.py | 1 + dataduct/config/credentials.py | 77 +++++++++++++++++ dataduct/config/tests/__init__.py | 0 dataduct/config/tests/test_credentials.py | 47 +++++++++++ dataduct/database/parsers/__init__.py | 1 + dataduct/database/parsers/create_table.py | 32 +++++-- .../parsers/tests/test_create_table.py | 25 ++++-- dataduct/database/table.py | 6 ++ dataduct/etl/etl_pipeline.py | 4 + dataduct/steps/__init__.py | 1 + dataduct/steps/create_load_redshift.py | 65 ++++++++++++++ .../scripts/create_load_redshift_runner.py | 84 +++++++++++++++++++ dataduct/utils/constants.py | 20 +++-- .../example_create_and_load_redshift.yaml | 12 +++ requirements.txt | 1 + 15 files changed, 355 insertions(+), 21 deletions(-) create mode 100644 dataduct/config/credentials.py create mode 100644 dataduct/config/tests/__init__.py create mode 100644 dataduct/config/tests/test_credentials.py create mode 100644 dataduct/steps/create_load_redshift.py create mode 100644 dataduct/steps/scripts/create_load_redshift_runner.py create mode 100644 examples/example_create_and_load_redshift.yaml diff --git a/dataduct/config/__init__.py b/dataduct/config/__init__.py index dd24350..548381f 100644 --- a/dataduct/config/__init__.py +++ b/dataduct/config/__init__.py @@ -1,2 +1,3 @@ from .config import Config from .logger_config import logger_configuration +from .credentials import get_aws_credentials diff --git a/dataduct/config/credentials.py b/dataduct/config/credentials.py new file mode 100644 index 0000000..e0937c0 --- /dev/null +++ b/dataduct/config/credentials.py @@ -0,0 +1,77 @@ +"""Credentials utility functions for connecting to various services +""" +import os +import requests +import sys +from ConfigParser import SafeConfigParser + + +def get_aws_credentials_from_iam(): + """Get aws credentials using the IAM api + Note: this script only runs on an EC2 instance with the appropriate + resource roles. For more information, see the following: + http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/\ + AESDG-chapter-instancedata.html + + Returns: + access_key(str): AWS access key + secret_key(str): AWS secret key + token(str): Connection token + """ + url = "http://169.254.169.254/latest/meta-data/iam/security-credentials/" + + # Get role name + r = requests.get(url) + + if not r.ok: + raise Exception("Request failed for url %s." % url) + + # Add role name to url + url += r.content + + # Get access keys + r = requests.get(url) + if not r.ok: + raise Exception("Request failed for url %s." 
% url) + + json_result = r.json() + return (json_result["AccessKeyId"], + json_result["SecretAccessKey"], + json_result["Token"]) + + +def get_aws_credentials_from_file(filename=None): + """Get the aws from credential files + """ + config = SafeConfigParser() + cred_file = None + if filename is not None and os.path.isfile(filename): + cred_file = filename + elif os.path.isfile('/etc/boto.cfg'): + cred_file = '/etc/boto.cfg' + elif os.path.isfile(os.path.expanduser('~/.boto')): + cred_file = os.path.expanduser('~/.boto') + elif os.path.isfile(os.path.expanduser('~/.aws/credentials')): + cred_file = os.path.expanduser('~/.aws/credentials') + else: + raise Exception("Cannot find a credentials file") + + config.read(cred_file) + aws_access_key_id = config.get('Credentials', + 'aws_access_key_id') + aws_secret_access_key = config.get('Credentials', + 'aws_secret_access_key') + return (aws_access_key_id, aws_secret_access_key, None) + + +def get_aws_credentials(filename=None): + """Get the aws credentials from IAM or files + """ + # try: + # aws_key, aws_secret, token = get_aws_credentials_from_iam() + # except Exception, error: + # sys.stderr.write("Failed to get creds from IAM: %s \n" % error.message) + # aws_key, aws_secret, token = get_aws_credentials_from_file(filename) + aws_key, aws_secret, token = get_aws_credentials_from_file(filename) + + return aws_key, aws_secret, token diff --git a/dataduct/config/tests/__init__.py b/dataduct/config/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dataduct/config/tests/test_credentials.py b/dataduct/config/tests/test_credentials.py new file mode 100644 index 0000000..c7db09f --- /dev/null +++ b/dataduct/config/tests/test_credentials.py @@ -0,0 +1,47 @@ +"""Tests for credentials file +""" +from mock import patch +from nose.tools import eq_ +import json + +from ..credentials import get_aws_credentials_from_iam + +@patch("requests.get") +def test_get_aws_credentials_from_iam(patched_requests_get): + """Test for get credentials from IAM + """ + class MockedReturn: + """Mock request response + """ + def __init__(self, content): + self.content = content + self.ok = True + + def json(self): + """Returns a json for the content + """ + return json.loads(self.content) + + def server_response(url): + """Mocked server responses + """ + if url == "http://169.254.169.254/latest/meta-data/iam/security-credentials/": # NOQA + return MockedReturn("role") + if url == "http://169.254.169.254/latest/meta-data/iam/security-credentials/role": # NOQA + return MockedReturn(""" + { + "Code" : "Success", + "LastUpdated" : "2012-04-26T16:39:16Z", + "Type" : "AWS-HMAC", + "AccessKeyId" : "access_id", + "SecretAccessKey" : "secret_key", + "Token" : "token", + "Expiration" : "2012-04-27T22:39:16Z" + } + """) + + patched_requests_get.side_effect = server_response + access_id, secret_key, token = get_aws_credentials_from_iam() + eq_(access_id, "access_id") + eq_(secret_key, "secret_key") + eq_(token, "token") diff --git a/dataduct/database/parsers/__init__.py b/dataduct/database/parsers/__init__.py index 19119bb..bedba98 100644 --- a/dataduct/database/parsers/__init__.py +++ b/dataduct/database/parsers/__init__.py @@ -9,4 +9,5 @@ from .select_query import parse_column_name from .create_table import parse_create_table +from .create_table import create_exits_clone from .create_view import parse_create_view diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index cbfc6dd..bbcfdb5 100644 --- 
a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -1,6 +1,6 @@ """Create SQL parser """ -from pyparsing import OneOrMore +from pyparsing import restOfLine from pyparsing import ParseException from pyparsing import ZeroOrMore @@ -40,22 +40,30 @@ def fk_reference(): return _references + fk_table + fk_reference_columns -def get_base_parser(): - """Get a pyparsing parser for a create table statement +def get_definition_start(): + """Get a pyparsing parse for start of the create table statement Returns: table_definition(pyparsing): Parser for create table statements """ - temp_check = temporary_check.setResultsName('temporary') exists_check = existance_check.setResultsName('exists_checks') table_name = _db_name.setResultsName('full_name') # Initial portions of the table definition - def_start = _create + temp_check + _table + table_name + exists_check + def_start = _create + temp_check + _table + exists_check + table_name + return def_start + + +def get_base_parser(): + """Get a pyparsing parser for a create table statement - table_def = def_start + paranthesis_list('raw_fields', def_field) + \ + Returns: + table_definition(pyparsing): Parser for create table statements + """ + table_def = get_definition_start() + \ + paranthesis_list('raw_fields', def_field) + \ get_attributes_parser() return table_def @@ -153,3 +161,15 @@ def parse_create_table(string): raise return table_data + + +def create_exits_clone(string): + """Create a clone of the table statement which has the exists check + """ + parser = get_definition_start() + restOfLine.setResultsName("definition") + result = to_dict(parser.parseString(string)) + template = "CREATE {temp} TABLE IF NOT EXISTS {table_name} {definition}" + return template.format(temp='TEMP' if result['temporary'] else '', + table_name=result['full_name'], + definition=result['definition']) + diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index fe536c5..d95cec1 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -7,6 +7,7 @@ from pyparsing import ParseException from ..create_table import parse_create_table +from ..create_table import create_exits_clone class TestCreateTableStatement(TestCase): @@ -20,18 +21,28 @@ def test_basic(): 'customer_id INTEGER DISTKEY PRIMARY KEY,' +\ 'customer_name VARCHAR(200))' - full_name = 'orders' - temporary = False - exists_checks = False - output = parse_create_table(query) - eq_(output['full_name'], full_name) - eq_(output['temporary'], temporary) - eq_(output['exists_checks'], exists_checks) + eq_(output['full_name'], 'orders') + eq_(output['temporary'], False) + eq_(output['exists_checks'], False) eq_(len(output['constraints']), 0) eq_(len(output['columns']), 2) + @staticmethod + def test_exists_clone(): + """Basic test for create table clone with exists condition + """ + query = 'CREATE TABLE orders (' +\ + 'customer_id INTEGER DISTKEY PRIMARY KEY,' +\ + 'customer_name VARCHAR(200))' + + exists_clone = create_exits_clone(query) + output = parse_create_table(exists_clone) + eq_(output['full_name'], 'orders') + eq_(output['temporary'], False) + eq_(output['exists_checks'], True) + @staticmethod @raises(ParseException) def test_bad_input(): diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 17321cd..463b07a 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -1,6 +1,7 @@ """Script containing 
the table class object """ from .parsers import parse_create_table +from .parsers import create_exits_clone from .sql import SqlScript from .select_statement import SelectStatement from .column import Column @@ -139,6 +140,11 @@ def temporary_clone_script(self): return SqlScript(sql) + def exists_clone_script(self): + """Sql script to create a exists clone table + """ + return SqlScript(create_exits_clone(self.sql_statement.sql())) + def drop_script(self): """Sql script to drop the table """ diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 92ca871..7c8bb74 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -32,6 +32,7 @@ from ..steps import PrimaryKeyCheckStep from ..steps import CountCheckStep from ..steps import ColumnCheckStep +from ..steps import CreateAndLoadStep from ..s3 import S3File @@ -469,6 +470,9 @@ def parse_step_args(self, step_type, **kwargs): elif step_type == 'load-redshift': step_class = LoadRedshiftStep + elif step_type == 'create-load-redshift': + step_class = CreateAndLoadStep + elif step_type in self.custom_steps: step_class = self.custom_steps[step_type] diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index fa40907..f14b7a3 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -13,3 +13,4 @@ from .primary_key_check import PrimaryKeyCheckStep from .count_check import CountCheckStep from .column_check import ColumnCheckStep +from .create_load_redshift import CreateAndLoadStep diff --git a/dataduct/steps/create_load_redshift.py b/dataduct/steps/create_load_redshift.py new file mode 100644 index 0000000..35a922a --- /dev/null +++ b/dataduct/steps/create_load_redshift.py @@ -0,0 +1,65 @@ +""" +ETL step wrapper for QA step can be executed on Ec2 resource +""" +import os + +from .transform import TransformStep +from ..database import Table +from ..database import SqlStatement +from ..config import Config +from ..utils import constants as const +from ..utils.helpers import parse_path + +config = Config() + + +class CreateAndLoadStep(TransformStep): + """CreateAndLoad Step class that creates table if needed and loads data + """ + + def __init__(self, id, table_definition, input_node=None, + script_arguments=None, **kwargs): + """Constructor for the CreateAndLoadStep class + + Args: + table_definition(filepath): schema file for the table to be loaded + script_arguments(list of str): list of arguments to the script + **kwargs(optional): Keyword arguments directly passed to base class + """ + with open(parse_path(table_definition)) as f: + table_def_string = f.read() + + table_exists_script = Table( + SqlStatement(table_def_string)).exists_clone_script() + + if isinstance(input_node, dict): + input_paths = [i.path().uri for i in input_node.values()] + else: + input_paths = [input_node.path().uri] + + + if script_arguments is None: + script_arguments = list() + + script_arguments.extend([ + '--table_definition=%s' % table_exists_script.sql(), + '--s3_input_paths'] + input_paths) + + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.CREATE_LOAD_SCRIPT_PATH) + + super(CreateAndLoadStep, self).__init__( + id=id, script=script, input_node=input_node, + script_arguments=script_arguments, **kwargs) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step 
arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py new file mode 100644 index 0000000..65f0c3e --- /dev/null +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python + +""" +Replacement for the load step to use the redshift COPY activity instead +""" +import argparse +from dataduct.config import get_aws_credentials +from dataduct.data_access import redshift_connection +from dataduct.database import SqlStatement +from dataduct.database import Table + + +def load_redshift(table_definition, input_paths, max_error=0, + replace_invalid_char=None, no_escape=False, gzip=False): + """Load redshift table with the data in the input s3 paths + """ + table_name = Table(SqlStatement(table_definition)).full_name + + # Credentials string + aws_key, aws_secret, token = get_aws_credentials() + creds = "aws_access_key_id=%s;aws_secret_access_key=%s" % ( + aws_key, aws_secret) + if token: + creds += ";token=%s" % token + + delete_statement = 'DELETE FROM %s;' % table_name + error_string = 'MAX ERROR %d' % max_error if max_error > 0 else '' + if replace_invalid_char is not None: + invalid_char_str = 'ACCEPTINVCHARS AS %s' % replace_invalid_char + else: + invalid_char_str = '' + + query = [delete_statement] + + for input_path in input_paths: + statement = ( + "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " + "DELIMETER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " + "{max_error} {invalid_char_str};" + ).format(table=table_name, + path=input_path, + creds=creds, + escape='ESCAPE' if not no_escape else '', + gzip='GZIP' if gzip else '', + max_error=error_string, + invalid_char_str=invalid_char_str) + query.append(statement) + return ' '.join(query) + + +def main(): + """Main Function + """ + parser = argparse.ArgumentParser() + parser.add_argument('--table_definition', dest='table_definition', + required=True) + parser.add_argument('--max_error', dest='max_error', default=0, type=int) + parser.add_argument('--replace_invalid_char', dest='replace_invalid_char', + default=None) + parser.add_argument('--no_escape', action='store_true', default=False) + parser.add_argument('--gzip', action='store_true', default=False) + parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+') + args = parser.parse_args() + print args + + connection = redshift_connection() + cursor = connection.cursor() + + # Create table in redshift, this is safe due to the if exists condition + cursor.execute(args.table_definition) + + # Load data into redshift + load_query = load_redshift(args.table_definition, args.input_paths, + args.max_error, args.replace_invalid_char, + args.no_escape, args.gzip) + + cursor.execute(load_query) + cursor.execute('COMMIT') + cursor.close() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index 99ea2f8..2fa0667 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -22,11 +22,15 @@ # Step paths SCRIPTS_DIRECTORY = 'scripts' -SCRIPT_RUNNER_PATH = os.path.join(SCRIPTS_DIRECTORY, 'script_runner.py') -DEPENDENCY_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, - 'pipeline_dependency_check.py') -PK_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, 'primary_key_test.py') -COUNT_CHECK_SCRIPT_PATH = 
os.path.join(SCRIPTS_DIRECTORY, - 'count_check_test.py') -COLUMN_CHECK_SCRIPT_PATH = os.path.join(SCRIPTS_DIRECTORY, - 'column_check_test.py') +SCRIPT_RUNNER_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'script_runner.py') +DEPENDENCY_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'pipeline_dependency_check.py') +PK_CHECK_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'primary_key_test.py') +COUNT_CHECK_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'count_check_test.py') +COLUMN_CHECK_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'column_check_test.py') +CREATE_LOAD_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'create_load_redshift_runner.py') diff --git a/examples/example_create_and_load_redshift.yaml b/examples/example_create_and_load_redshift.yaml new file mode 100644 index 0000000..1be0c4d --- /dev/null +++ b/examples/example_create_and_load_redshift.yaml @@ -0,0 +1,12 @@ +name : example_create_and_load_redshift +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +description : Example for the load_redshift step + +steps: +- step_type: extract-local + path: data/test_table1.tsv + +- step_type: create-load-redshift + table_definition: tables/dev.test_table.sql diff --git a/requirements.txt b/requirements.txt index 8085d0d..e42b6eb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,3 +10,4 @@ coverage pyparsing>=2 pygraphviz testfixtures>=4.1.1 +mock From 9e64ec6d6c8d1ac74fe531b2e73056c69b8efe2c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 21:51:20 -0800 Subject: [PATCH 083/175] uncomment stuff --- dataduct/config/credentials.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/dataduct/config/credentials.py b/dataduct/config/credentials.py index e0937c0..3c03f0f 100644 --- a/dataduct/config/credentials.py +++ b/dataduct/config/credentials.py @@ -67,11 +67,10 @@ def get_aws_credentials_from_file(filename=None): def get_aws_credentials(filename=None): """Get the aws credentials from IAM or files """ - # try: - # aws_key, aws_secret, token = get_aws_credentials_from_iam() - # except Exception, error: - # sys.stderr.write("Failed to get creds from IAM: %s \n" % error.message) - # aws_key, aws_secret, token = get_aws_credentials_from_file(filename) - aws_key, aws_secret, token = get_aws_credentials_from_file(filename) + try: + aws_key, aws_secret, token = get_aws_credentials_from_iam() + except Exception, error: + sys.stderr.write("Failed to get creds from IAM: %s \n" % error.message) + aws_key, aws_secret, token = get_aws_credentials_from_file(filename) return aws_key, aws_secret, token From 41b0fa42cf4a6bbeaaa36edb9355f6336f54e2bb Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 22:12:40 -0800 Subject: [PATCH 084/175] query typo fix --- dataduct/steps/scripts/create_load_redshift_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index 65f0c3e..3231b0d 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -24,7 +24,7 @@ def load_redshift(table_definition, input_paths, max_error=0, creds += ";token=%s" % token delete_statement = 'DELETE FROM %s;' % table_name - error_string = 'MAX ERROR %d' % max_error if max_error > 0 else '' + error_string = 'MAXERROR %d' % max_error if max_error > 0 else '' if replace_invalid_char is not None: invalid_char_str = 'ACCEPTINVCHARS AS %s' % 
replace_invalid_char else: @@ -35,7 +35,7 @@ def load_redshift(table_definition, input_paths, max_error=0, for input_path in input_paths: statement = ( "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " - "DELIMETER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " + "DELIMITER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " "{max_error} {invalid_char_str};" ).format(table=table_name, path=input_path, From 12f421e8f69ea82e18daa1045225b0c7e6c757fb Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 16 Feb 2015 23:07:02 -0800 Subject: [PATCH 085/175] style fix --- dataduct/config/credentials.py | 14 +++++++------- dataduct/config/tests/test_credentials.py | 12 ++++++------ dataduct/database/parsers/create_table.py | 4 ++-- .../database/parsers/tests/test_create_table.py | 12 ++++++------ dataduct/steps/create_load_redshift.py | 3 +-- .../steps/scripts/create_load_redshift_runner.py | 8 ++++---- 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/dataduct/config/credentials.py b/dataduct/config/credentials.py index 3c03f0f..3afa75a 100644 --- a/dataduct/config/credentials.py +++ b/dataduct/config/credentials.py @@ -18,13 +18,13 @@ def get_aws_credentials_from_iam(): secret_key(str): AWS secret key token(str): Connection token """ - url = "http://169.254.169.254/latest/meta-data/iam/security-credentials/" + url = 'http://169.254.169.254/latest/meta-data/iam/security-credentials/' # Get role name r = requests.get(url) if not r.ok: - raise Exception("Request failed for url %s." % url) + raise Exception('Request failed for url %s.' % url) # Add role name to url url += r.content @@ -32,12 +32,12 @@ def get_aws_credentials_from_iam(): # Get access keys r = requests.get(url) if not r.ok: - raise Exception("Request failed for url %s." % url) + raise Exception('Request failed for url %s.' 
% url) json_result = r.json() - return (json_result["AccessKeyId"], - json_result["SecretAccessKey"], - json_result["Token"]) + return (json_result['AccessKeyId'], + json_result['SecretAccessKey'], + json_result['Token']) def get_aws_credentials_from_file(filename=None): @@ -70,7 +70,7 @@ def get_aws_credentials(filename=None): try: aws_key, aws_secret, token = get_aws_credentials_from_iam() except Exception, error: - sys.stderr.write("Failed to get creds from IAM: %s \n" % error.message) + sys.stderr.write('Failed to get creds from IAM: %s \n' % error.message) aws_key, aws_secret, token = get_aws_credentials_from_file(filename) return aws_key, aws_secret, token diff --git a/dataduct/config/tests/test_credentials.py b/dataduct/config/tests/test_credentials.py index c7db09f..4a96926 100644 --- a/dataduct/config/tests/test_credentials.py +++ b/dataduct/config/tests/test_credentials.py @@ -6,7 +6,7 @@ from ..credentials import get_aws_credentials_from_iam -@patch("requests.get") +@patch('requests.get') def test_get_aws_credentials_from_iam(patched_requests_get): """Test for get credentials from IAM """ @@ -25,9 +25,9 @@ def json(self): def server_response(url): """Mocked server responses """ - if url == "http://169.254.169.254/latest/meta-data/iam/security-credentials/": # NOQA + if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/': # NOQA return MockedReturn("role") - if url == "http://169.254.169.254/latest/meta-data/iam/security-credentials/role": # NOQA + if url == 'http://169.254.169.254/latest/meta-data/iam/security-credentials/role': # NOQA return MockedReturn(""" { "Code" : "Success", @@ -42,6 +42,6 @@ def server_response(url): patched_requests_get.side_effect = server_response access_id, secret_key, token = get_aws_credentials_from_iam() - eq_(access_id, "access_id") - eq_(secret_key, "secret_key") - eq_(token, "token") + eq_(access_id, 'access_id') + eq_(secret_key, 'secret_key') + eq_(token, 'token') diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index bbcfdb5..63961f7 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -166,9 +166,9 @@ def parse_create_table(string): def create_exits_clone(string): """Create a clone of the table statement which has the exists check """ - parser = get_definition_start() + restOfLine.setResultsName("definition") + parser = get_definition_start() + restOfLine.setResultsName('definition') result = to_dict(parser.parseString(string)) - template = "CREATE {temp} TABLE IF NOT EXISTS {table_name} {definition}" + template = 'CREATE {temp} TABLE IF NOT EXISTS {table_name} {definition}' return template.format(temp='TEMP' if result['temporary'] else '', table_name=result['full_name'], definition=result['definition']) diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index d95cec1..500e30a 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -17,9 +17,9 @@ class TestCreateTableStatement(TestCase): def test_basic(): """Basic test for create table """ - query = 'CREATE TABLE orders (' +\ - 'customer_id INTEGER DISTKEY PRIMARY KEY,' +\ - 'customer_name VARCHAR(200))' + query = ('CREATE TABLE orders (' + 'customer_id INTEGER DISTKEY PRIMARY KEY,' + 'customer_name VARCHAR(200))') output = parse_create_table(query) @@ -33,9 +33,9 @@ def test_basic(): def test_exists_clone(): """Basic test for 
create table clone with exists condition """ - query = 'CREATE TABLE orders (' +\ - 'customer_id INTEGER DISTKEY PRIMARY KEY,' +\ - 'customer_name VARCHAR(200))' + query = ('CREATE TABLE orders (' + 'customer_id INTEGER DISTKEY PRIMARY KEY,' + 'customer_name VARCHAR(200))') exists_clone = create_exits_clone(query) output = parse_create_table(exists_clone) diff --git a/dataduct/steps/create_load_redshift.py b/dataduct/steps/create_load_redshift.py index 35a922a..2e42a5d 100644 --- a/dataduct/steps/create_load_redshift.py +++ b/dataduct/steps/create_load_redshift.py @@ -1,5 +1,4 @@ -""" -ETL step wrapper for QA step can be executed on Ec2 resource +"""ETL step wrapper for QA step can be executed on Ec2 resource """ import os diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index 3231b0d..cbd81da 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -1,8 +1,8 @@ #!/usr/bin/env python +"""Replacement for the load step to use the redshift COPY command instead """ -Replacement for the load step to use the redshift COPY activity instead -""" + import argparse from dataduct.config import get_aws_credentials from dataduct.data_access import redshift_connection @@ -18,10 +18,10 @@ def load_redshift(table_definition, input_paths, max_error=0, # Credentials string aws_key, aws_secret, token = get_aws_credentials() - creds = "aws_access_key_id=%s;aws_secret_access_key=%s" % ( + creds = 'aws_access_key_id=%s;aws_secret_access_key=%s' % ( aws_key, aws_secret) if token: - creds += ";token=%s" % token + creds += ';token=%s' % token delete_statement = 'DELETE FROM %s;' % table_name error_string = 'MAXERROR %d' % max_error if max_error > 0 else '' From 60fc782afecb7a9949daa75724e3e63049b10871 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 17 Feb 2015 02:01:10 -0800 Subject: [PATCH 086/175] upset and reload steps --- dataduct/database/__init__.py | 1 + dataduct/database/table.py | 4 +- dataduct/etl/etl_pipeline.py | 8 +++ dataduct/steps/__init__.py | 2 + dataduct/steps/reload.py | 27 ++++++++ dataduct/steps/upsert.py | 68 +++++++++++++++++++ examples/example_reload.yaml | 16 +++++ examples/example_upsert.yaml | 16 +++++ .../resources/tables/dev.test_table_2.sql | 4 ++ 9 files changed, 144 insertions(+), 2 deletions(-) create mode 100644 dataduct/steps/reload.py create mode 100644 dataduct/steps/upsert.py create mode 100644 examples/example_reload.yaml create mode 100644 examples/example_upsert.yaml create mode 100644 examples/resources/tables/dev.test_table_2.sql diff --git a/dataduct/database/__init__.py b/dataduct/database/__init__.py index 7710f0e..3a2db43 100644 --- a/dataduct/database/__init__.py +++ b/dataduct/database/__init__.py @@ -4,3 +4,4 @@ from .sql import SqlStatement from .table import Table from .view import View +from .history_table import HistoryTable diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 463b07a..5d1e8c4 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -252,7 +252,7 @@ def de_duplication_script(self): # Create a temporary clone from the script temp_table = self.__class__(script) script.append(temp_table.insert_script(self)) - script.append(self.delete_script) + script.append(self.delete_script()) # Pick a random value on multiple primary keys sql = """ @@ -267,7 +267,7 @@ def de_duplication_script(self): WHERE rnk = 1) """.format(table_name=self.full_name, 
column_names=comma_seperated(column_names), - pk_names=self.primary_key_names, + pk_names=comma_seperated(self.primary_key_names), temp_table=temp_table.full_name) script.append(SqlScript(sql)) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 7c8bb74..29be146 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -33,6 +33,8 @@ from ..steps import CountCheckStep from ..steps import ColumnCheckStep from ..steps import CreateAndLoadStep +from ..steps import UpsertStep +from ..steps import ReloadStep from ..s3 import S3File @@ -473,6 +475,12 @@ def parse_step_args(self, step_type, **kwargs): elif step_type == 'create-load-redshift': step_class = CreateAndLoadStep + elif step_type == 'upsert': + step_class = UpsertStep + + elif step_type == 'reload': + step_class = ReloadStep + elif step_type in self.custom_steps: step_class = self.custom_steps[step_type] diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index f14b7a3..db675e8 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -14,3 +14,5 @@ from .count_check import CountCheckStep from .column_check import ColumnCheckStep from .create_load_redshift import CreateAndLoadStep +from .upsert import UpsertStep +from .reload import ReloadStep diff --git a/dataduct/steps/reload.py b/dataduct/steps/reload.py new file mode 100644 index 0000000..39281ee --- /dev/null +++ b/dataduct/steps/reload.py @@ -0,0 +1,27 @@ +"""ETL step wrapper for Reload SQL script +""" +from .upsert import UpsertStep + + +class ReloadStep(UpsertStep): + """Reload Step class that helps run a step on the emr cluster + """ + + def __init__(self, **kwargs): + """Constructor for the ReloadStep class + + Args: + **kwargs(optional): Keyword arguments directly passed to base class + """ + super(ReloadStep, self).__init__(**kwargs) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args['delete_existing'] = True + return super(ReloadStep, cls).arguments_processor(etl, input_args) diff --git a/dataduct/steps/upsert.py b/dataduct/steps/upsert.py new file mode 100644 index 0000000..f03b87c --- /dev/null +++ b/dataduct/steps/upsert.py @@ -0,0 +1,68 @@ +"""ETL step wrapper for Upsert SQL script +""" +from .etl_step import ETLStep +from ..pipeline import SqlActivity +from ..database import Table +from ..database import SqlScript +from ..database import SelectStatement +from ..database import HistoryTable +from ..s3 import S3File +from ..utils.helpers import parse_path +from ..utils.helpers import exactly_one + + +class UpsertStep(ETLStep): + """Upsert Step class that helps run a step on the emr cluster + """ + + def __init__(self, destination, redshift_database, sql=None, + script=None, source=None, enforce_primary_key=True, + delete_existing=False, history=None, **kwargs): + """Constructor for the UpsertStep class + + Args: + **kwargs(optional): Keyword arguments directly passed to base class + """ + assert exactly_one(sql, source, script), 'One of sql/source/script' + super(UpsertStep, self).__init__(**kwargs) + + # Input formatting + dest = Table(SqlScript(filename=parse_path(destination))) + if source is not None: + source_relation = Table(SqlScript(filename=parse_path(source))) + else: + source_relation = SelectStatement( + SqlScript(sql=sql, 
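# A rough sketch (an assumption, not the exact SQL emitted) of the script the
# upsert step assembles just below, using the example tables shipped with this
# patch (source tables/dev.test_table.sql, destination tables/dev.test_table_2.sql):
#   1. CREATE TABLE IF NOT EXISTS dev.test_table_2 (...);   -- exists clone
#   2. the upsert of the source relation into dev.test_table_2, preceded by a
#      DELETE of existing rows when delete_existing is set (the reload step);
#   3. an optional history-table update when history= is provided.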
filename=script).sql()) + + # Create the destination table if doesn't exist + script = dest.exists_clone_script() + script.append(dest.upsert_script( + source_relation, enforce_primary_key, delete_existing)) + + if history: + hist = HistoryTable(SqlScript( + filename=parse_path(history))) + script.append(hist.update_history_script(dest)) + + self.activity = self.create_pipeline_object( + object_class=SqlActivity, + resource=self.resource, + schedule=self.schedule, + depends_on=self.depends_on, + database=redshift_database, + max_retries=self.max_retries, + script=self.create_script(S3File(text=script.sql()))) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + input_args = cls.pop_inputs(input_args) + step_args = cls.base_arguments_processor(etl, input_args) + step_args['resource'] = etl.ec2_resource + step_args['redshift_database'] = etl.redshift_database + return step_args diff --git a/examples/example_reload.yaml b/examples/example_reload.yaml new file mode 100644 index 0000000..287a7c9 --- /dev/null +++ b/examples/example_reload.yaml @@ -0,0 +1,16 @@ +name : example_reload +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +description : Example for the reload step + +steps: +- step_type: extract-local + path: data/test_table1.tsv + +- step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + +- step_type: reload + source: tables/dev.test_table.sql + destination: tables/dev.test_table_2.sql diff --git a/examples/example_upsert.yaml b/examples/example_upsert.yaml new file mode 100644 index 0000000..b882a86 --- /dev/null +++ b/examples/example_upsert.yaml @@ -0,0 +1,16 @@ +name : example_upsert +frequency : one-time +load_time: 01:00 # Hour:Min in UTC + +description : Example for the upsert step + +steps: +- step_type: extract-local + path: data/test_table1.tsv + +- step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + +- step_type: upsert + source: tables/dev.test_table.sql + destination: tables/dev.test_table_2.sql diff --git a/examples/resources/tables/dev.test_table_2.sql b/examples/resources/tables/dev.test_table_2.sql new file mode 100644 index 0000000..81eb90d --- /dev/null +++ b/examples/resources/tables/dev.test_table_2.sql @@ -0,0 +1,4 @@ +CREATE TABLE dev.test_table_2( + id INTEGER PRIMARY KEY, + description VARCHAR(255) +); From 8f2360364057ef8bec45dd13f6cc7c06b5d47f95 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 18 Feb 2015 11:13:00 -0800 Subject: [PATCH 087/175] refactor step class allocation --- dataduct/etl/etl_actions.py | 14 ++-- dataduct/etl/etl_pipeline.py | 132 ++--------------------------------- dataduct/etl/utils.py | 68 ++++++++++++++++++ dataduct/steps/upsert.py | 2 +- 4 files changed, 83 insertions(+), 133 deletions(-) create mode 100644 dataduct/etl/utils.py diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 441a738..708d2fd 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -2,19 +2,20 @@ """ import yaml +from .etl_pipeline import ETLPipeline from ..pipeline import Activity from ..pipeline import MysqlNode from ..pipeline import RedshiftNode from ..pipeline import S3Node -from .etl_pipeline import ETLPipeline from ..utils.exceptions import ETLInputError -URL_TEMPLATE = 
'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa - import logging logger = logging.getLogger(__name__) +URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa + + def read_pipeline_definition(file_path): """Function reads the yaml pipeline definitions. @@ -109,14 +110,14 @@ def visualize_pipeline(etl, activities_only=False, filename=None): # Add nodes for all activities for p_object in pipeline_objects: if isinstance(p_object, Activity): - graph.add_node(p_object.id, shape='diamond', color='turquoise', + graph.add_node(p_object.id, shape='rect', color='turquoise', style='filled') if not activities_only: if isinstance(p_object, MysqlNode): - graph.add_node(p_object.id, shape='egg', color='beige', + graph.add_node(p_object.id, shape='oval', color='beige', style='filled') if isinstance(p_object, RedshiftNode): - graph.add_node(p_object.id, shape='egg', color='goldenrod', + graph.add_node(p_object.id, shape='oval', color='goldenrod', style='filled') if isinstance(p_object, S3Node): graph.add_node(p_object.id, shape='folder', color='grey', @@ -153,5 +154,6 @@ def visualize_pipeline(etl, activities_only=False, filename=None): graph.add_edge(dependency.id, p_object.id, color='grey') # Plotting the graph with dot layout + graph.tred() graph.layout(prog='dot') graph.draw(filename) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 29be146..5c2a74f 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -3,8 +3,8 @@ """ from datetime import datetime import yaml -import imp +from .utils import process_steps from ..config import Config from ..pipeline import DefaultObject @@ -17,32 +17,11 @@ from ..pipeline import SNSAlarm from ..pipeline.utils import list_pipelines -from ..steps import ETLStep -from ..steps import EMRJobStep -from ..steps import EMRStreamingStep -from ..steps import ExtractLocalStep -from ..steps import ExtractRdsStep -from ..steps import ExtractRedshiftStep -from ..steps import ExtractS3Step -from ..steps import LoadRedshiftStep -from ..steps import PipelineDependenciesStep -from ..steps import SqlCommandStep -from ..steps import TransformStep -from ..steps import QATransformStep -from ..steps import PrimaryKeyCheckStep -from ..steps import CountCheckStep -from ..steps import ColumnCheckStep -from ..steps import CreateAndLoadStep -from ..steps import UpsertStep -from ..steps import ReloadStep - - from ..s3 import S3File from ..s3 import S3Path from ..s3 import S3LogPath from ..utils.exceptions import ETLInputError -from ..utils.helpers import parse_path from ..utils import constants as const config = Config() @@ -50,6 +29,7 @@ MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) +NAME_PREFIX = config.etl.get('NAME_PREFIX', const.EMPTY_STR) class ETLPipeline(object): @@ -83,7 +63,7 @@ def __init__(self, name, frequency='one-time', load_hour, load_min = [None, None] # Input variables - self._name = name + self._name = name if not NAME_PREFIX else NAME_PREFIX + '_' + name self.frequency = frequency self.ec2_resource_terminate_after = ec2_resource_terminate_after self.load_hour = load_hour @@ -104,8 +84,6 @@ def __init__(self, name, frequency='one-time', else: self.emr_cluster_config = dict() - self.custom_steps = self.get_custom_steps() - # Pipeline versions self.version_ts = 
datetime.utcnow() self.version_name = "version_" + \ @@ -391,106 +369,6 @@ def translate_input_nodes(self, input_node): output[value] = self.intermediate_nodes[key] return output - @staticmethod - def get_custom_steps(): - """Fetch the custom steps specified in config - """ - custom_steps = dict() - - for step_def in getattr(config, 'custom_steps', list()): - step_type = step_def['step_type'] - path = parse_path(step_def['file_path'], 'CUSTOM_STEPS_PATH') - - # Load source from the file path provided - step_mod = imp.load_source(step_type, path) - - # Get the step class based on class_name provided - step_class = getattr(step_mod, step_def['class_name']) - - # Check if step_class is of type ETLStep - if not issubclass(step_class, ETLStep): - raise ETLInputError('Step type %s is not of type ETLStep') - - custom_steps[step_type] = step_class - - return custom_steps - - def parse_step_args(self, step_type, **kwargs): - """Parse step arguments from input to correct ETL step types - - Args: - step_type(str): string specifing step_type of the objects - **kwargs: Keyword arguments read from YAML - - Returns: - step_class(ETLStep): Class object for the specific type - step_args(dict): dictionary of step arguments - """ - - if not isinstance(step_type, str): - raise ETLInputError('Step type must be a string') - - if step_type == 'transform': - step_class = TransformStep - - elif step_type == 'qa-transform': - step_class = QATransformStep - - elif step_type == 'extract-s3': - step_class = ExtractS3Step - - elif step_type == 'primary-key-check': - step_class = PrimaryKeyCheckStep - - elif step_type == 'count-check': - step_class = CountCheckStep - - elif step_type == 'column-check': - step_class = ColumnCheckStep - - elif step_type == 'extract-local': - step_class = ExtractLocalStep - - elif step_type == 'extract-rds': - step_class = ExtractRdsStep - - elif step_type == 'extract-redshift': - step_class = ExtractRedshiftStep - - elif step_type == 'sql-command': - step_class = SqlCommandStep - - elif step_type == 'emr-streaming': - step_class = EMRStreamingStep - - elif step_type == 'emr-step': - step_class = EMRJobStep - - elif step_type == 'pipeline-dependencies': - step_class = PipelineDependenciesStep - - elif step_type == 'load-redshift': - step_class = LoadRedshiftStep - - elif step_type == 'create-load-redshift': - step_class = CreateAndLoadStep - - elif step_type == 'upsert': - step_class = UpsertStep - - elif step_type == 'reload': - step_class = ReloadStep - - elif step_type in self.custom_steps: - step_class = self.custom_steps[step_type] - - else: - raise ETLInputError('Step type %s not recogonized' % step_type) - - step_args = step_class.arguments_processor(self, kwargs) - - return step_class, step_args - def add_step(self, step, is_bootstrap=False): """Add a step to the pipeline @@ -527,6 +405,7 @@ def create_steps(self, steps_params, is_bootstrap=False): """ input_node = None steps = [] + steps_params = process_steps(steps_params) for step_param in steps_params: # Assume that the preceding step is the input if not specified @@ -536,7 +415,8 @@ def create_steps(self, steps_params, is_bootstrap=False): step_param['input_node'] = input_node try: - step_class, step_args = self.parse_step_args(**step_param) + step_class = step_param.pop('step_class') + step_args = step_class.arguments_processor(self, step_param) except Exception: print 'Error creating step with params : ', step_param raise diff --git a/dataduct/etl/utils.py b/dataduct/etl/utils.py new file mode 100644 index 0000000..a2709c2 
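The long if/elif chain removed above collapses into a dictionary lookup. A simplified sketch of the resulting flow, with a made-up step list and `etl` standing in for an ETLPipeline instance, might look like this:

from dataduct.etl.utils import process_steps  # the new module added below

steps_params = process_steps([
    {'step_type': 'extract-local', 'path': 'data/test_table1.tsv'},
])
for step_param in steps_params:
    step_class = step_param.pop('step_class')          # e.g. ExtractLocalStep
    step_args = step_class.arguments_processor(etl, step_param)
    step = step_class(**step_args)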
--- /dev/null +++ b/dataduct/etl/utils.py @@ -0,0 +1,68 @@ +"""Utility functions for processing etl steps +""" +import imp +from ..config import Config +from ..steps import * # noqa +from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError + + +STEP_CLASSES = { + 'column-check': ColumnCheckStep, + 'count-check': CountCheckStep, + 'create-load-redshift': CreateAndLoadStep, + 'emr-step': EMRJobStep, + 'emr-streaming': EMRStreamingStep, + 'extract-local': ExtractLocalStep, + 'extract-rds': ExtractRdsStep, + 'extract-redshift': ExtractRedshiftStep, + 'extract-s3': ExtractS3Step, + 'load-redshift': LoadRedshiftStep, + 'pipeline-dependencies': PipelineDependenciesStep, + 'primary-key-check': PrimaryKeyCheckStep, + 'qa-transform': QATransformStep, + 'reload': ReloadStep, + 'sql-command': SqlCommandStep, + 'transform': TransformStep, + 'upsert': UpsertStep, +} + + +def get_custom_steps(): + """Fetch the custom steps specified in config + """ + config = Config() + custom_steps = dict() + + for step_def in getattr(config, 'custom_steps', list()): + step_type = step_def['step_type'] + path = parse_path(step_def['file_path'], 'CUSTOM_STEPS_PATH') + + # Load source from the file path provided + step_mod = imp.load_source(step_type, path) + + # Get the step class based on class_name provided + step_class = getattr(step_mod, step_def['class_name']) + + # Check if step_class is of type ETLStep + if not issubclass(step_class, ETLStep): + raise ETLInputError('Step type %s is not of type ETLStep') + + custom_steps[step_type] = step_class + return custom_steps + + +def process_steps(steps_params): + """Format the step parameters by changing step type to step class + """ + step_config = STEP_CLASSES.copy() + step_config.update(get_custom_steps()) + steps = [] + + for step_param in steps_params: + params = step_param.copy() + step_type = params.pop('step_type') + params['step_class'] = step_config[step_type] + steps.append(params) + + return steps diff --git a/dataduct/steps/upsert.py b/dataduct/steps/upsert.py index f03b87c..b24f8a9 100644 --- a/dataduct/steps/upsert.py +++ b/dataduct/steps/upsert.py @@ -61,8 +61,8 @@ def arguments_processor(cls, etl, input_args): etl(ETLPipeline): Pipeline object containing resources and steps step_args(dict): Dictionary of the step arguments for the class """ - input_args = cls.pop_inputs(input_args) step_args = cls.base_arguments_processor(etl, input_args) + cls.pop_inputs(step_args) step_args['resource'] = etl.ec2_resource step_args['redshift_database'] = etl.redshift_database return step_args From 49a57bb34727f982ad179fc29d61ab609a88da67 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 18 Feb 2015 17:15:19 -0800 Subject: [PATCH 088/175] Add ability to use custom instance types --- dataduct/etl/etl_pipeline.py | 5 +++++ examples/example_transform.yaml | 3 ++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 29be146..aeaed54 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -50,6 +50,7 @@ MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) +INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', const.M1_LARGE) class ETLPipeline(object): @@ -61,6 +62,7 @@ class ETLPipeline(object): """ def __init__(self, name, frequency='one-time', ec2_resource_terminate_after='6 Hours', + 
ec2_resource_instance_type=INSTANCE_TYPE, delay=0, emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=MAX_RETRIES, bootstrap=None): @@ -70,6 +72,7 @@ def __init__(self, name, frequency='one-time', name (str): Name of the pipeline should be globally unique. frequency (enum): Frequency of the pipeline. Can be ec2_resource_terminate_after (str): Timeout for ec2 resource + ec2_resource_instance_type (str): Instance type for ec2 resource delay(int): Number of days to delay the pipeline by emr_cluster_config(dict): Dictionary for emr config topic_arn(str): sns alert to be used by the pipeline @@ -86,6 +89,7 @@ def __init__(self, name, frequency='one-time', self._name = name self.frequency = frequency self.ec2_resource_terminate_after = ec2_resource_terminate_after + self.ec2_resource_instance_type = ec2_resource_instance_type self.load_hour = load_hour self.load_min = load_min self.delay = delay @@ -283,6 +287,7 @@ def ec2_resource(self): s3_log_dir=self.s3_log_dir, schedule=self.schedule, terminate_after=self.ec2_resource_terminate_after, + instance_type=self.ec2_resource_instance_type, ) self.create_bootstrap_steps(const.EC2_RESOURCE_STR) diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml index a9c3423..b4a2e73 100644 --- a/examples/example_transform.yaml +++ b/examples/example_transform.yaml @@ -1,8 +1,9 @@ name : example_transform frequency : one-time load_time: 01:00 # Hour:Min in UTC +ec2_resource_instance_type: m1.small -description : Example for the transform step +description : Example for the transform step, uses an m1.small instance steps: - step_type: extract-local From c103cfb49f404530ccc15bd42d0d51e20c8233c2 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 18 Feb 2015 17:17:51 -0800 Subject: [PATCH 089/175] Changed description again --- examples/example_transform.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml index b4a2e73..0fa2b32 100644 --- a/examples/example_transform.yaml +++ b/examples/example_transform.yaml @@ -3,7 +3,7 @@ frequency : one-time load_time: 01:00 # Hour:Min in UTC ec2_resource_instance_type: m1.small -description : Example for the transform step, uses an m1.small instance +description : Example for the transform step, uses an m1.small instance instead of the default steps: - step_type: extract-local From e352dd9482e50e09f981e252d0c113d9fa668e95 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 18 Feb 2015 23:01:46 -0800 Subject: [PATCH 090/175] Add custom, more verbose help message --- bin/dataduct | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/bin/dataduct b/bin/dataduct index b5ab723..a4307f0 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -100,15 +100,44 @@ def visualize_database_actions(table_definitions, filename): database.visualize(filename) +class _HelpAction(argparse._HelpAction): + """HelpAction class used to render a custom help message + """ + def __call__(self, parser, namespace, values, option_string=None): + parser.print_help() + print '' + + # retrieve subparsers from parser + subparsers_actions = [ + action for action in parser._actions + if isinstance(action, argparse._SubParsersAction)] + + for subparsers_action in subparsers_actions: + # get all subparsers and print help + for choice, subparser in subparsers_action.choices.items(): + print "Command '{}'".format(choice) + print subparser.format_usage() + + parser.exit() + + def main(): """Main 
function""" - parser = argparse.ArgumentParser(description='Run Dataduct commands') + parser = argparse.ArgumentParser(description='Run Dataduct commands', + add_help=False) parser.add_argument( '-m', '--mode', default=None, help='Mode to run the pipeline and config overrides to use', ) + # Overwrite default help + parser.add_argument( + '-h', + '--help', + action=_HelpAction, + help='Show this help message and exit', + ) subparsers = parser.add_subparsers(help='Commands', dest='command') # Config parser declaration From 9931de461a3ec6b967f7611067804d727bc8c083 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 18 Feb 2015 23:17:35 -0800 Subject: [PATCH 091/175] QA log to S3 --- dataduct/qa/check.py | 55 +++++++++++++++++++-- dataduct/steps/column_check.py | 5 +- dataduct/steps/count_check.py | 5 +- dataduct/steps/primary_key_check.py | 13 +++-- dataduct/steps/scripts/column_check_test.py | 6 ++- dataduct/steps/scripts/count_check_test.py | 6 ++- dataduct/steps/scripts/primary_key_test.py | 4 +- examples/example_column_check.yaml | 1 + examples/example_count_check.yaml | 1 + examples/example_primary_key_check.yaml | 3 ++ 10 files changed, 87 insertions(+), 12 deletions(-) diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index a422c92..83e7fcc 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -1,8 +1,16 @@ """Base class for QA steps that provides template function for publishing """ from boto.sns import SNSConnection -from ..config import Config +import datetime + from .utils import render_output +from ..config import Config +from ..database import SelectStatement +from ..s3 import S3Path +from ..s3 import S3File +from ..utils.helpers import exactly_one + +QA_TEST_ROW_LENGTH = 8 class Check(object): @@ -84,7 +92,8 @@ def alert_subject(self): """ return "Failure on %s" % self.name - def publish(self, export_func=None): + def publish(self, log_to_s3=False, dest_sql=None, table=None, + path_suffix=None): """Publish the results of the QA test Note: @@ -96,8 +105,8 @@ def publish(self, export_func=None): print self.results print self.summary - if export_func is not None: - export_func(self.export_output) + if log_to_s3: + self.log_output_to_s3(dest_sql, table, path_suffix) if not self.success: if self.alert_func is not None: @@ -105,3 +114,41 @@ def publish(self, export_func=None): self.alert_func(self.summary, self.alert_subject) else: raise Exception(self.alert_subject) + + def log_output_to_s3(self, destination_sql=None, table=None, + path_suffix=None): + """Log the results of the QA test in S3 + """ + assert exactly_one(destination_sql, table), "Needs table or dest_sql" + + if destination_sql is not None: + full_table_name = SelectStatement(destination_sql).dependencies[0] + else: + full_table_name = table + + config = Config() + + schema_name, table_name = full_table_name.split('.', 1) + pipeline_name, _ = self.name.split(".", 1) + timestamp = datetime.utcnow() + + row = [schema_name, table_name, pipeline_name, timestamp] + row.extend(self.export_output) + if len(row) < QA_TEST_ROW_LENGTH: + row.extend(['NULL'] * (QA_TEST_ROW_LENGTH - len(row))) + + # Convert to TSV + string = '\t'.join(map(str, row)) + + # S3 Path computation + qa_test_dir_uri = config.etl.get('S3_BASE_PATH', '') + \ + config.elt.get('QA_LOG_PATH', 'qa') + qa_test_dir_uri += path_suffix if path_suffix else '' + parent_dir = S3Path(uri=qa_test_dir_uri, is_directory=True) + + key = '_'.join(map(str, row)).replace('.', '_').replace(' ', '_') + key += '.tsv' + + qa_tests_path = S3Path(key=key, 
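# Illustration with made-up values: a row like
#   ['dev', 'test_table', 'my_pipeline', datetime.utcnow(), <check outputs>]
# (padded with 'NULL' up to QA_TEST_ROW_LENGTH fields) is written as a single
# tab-separated line, stored under a key built from those same fields with
# dots and spaces replaced by underscores and a '.tsv' suffix.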
parent_dir=parent_dir) + qa_tests_file = S3File(text=string, s3_path=qa_tests_path) + qa_tests_file.upload_to_s3() diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index 99a54fa..7c3fc2f 100644 --- a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -25,7 +25,7 @@ def __init__(self, id, source_sql, source_host, destination_table_definition=None, destination_sql=None, sql_tail_for_source=None, sample_size=100, tolerance=1.0, script_arguments=None, - **kwargs): + log_to_s3=False, **kwargs): """Constructor for the ColumnCheckStep class Args: @@ -59,6 +59,9 @@ def __init__(self, id, source_sql, source_host, '--source_host=%s' % source_host ]) + if log_to_s3: + script_arguments.append('--log_to_s3') + steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.COLUMN_CHECK_SCRIPT_PATH) diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py index 8a243e5..f78ac88 100644 --- a/dataduct/steps/count_check.py +++ b/dataduct/steps/count_check.py @@ -20,7 +20,7 @@ class CountCheckStep(QATransformStep): def __init__(self, id, source_host, source_sql=None, source_table_name=None, destination_table_name=None, destination_sql=None, - tolerance=1.0, script_arguments=None, + tolerance=1.0, script_arguments=None, log_to_s3=False, **kwargs): """Constructor for the CountCheckStep class @@ -53,6 +53,9 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, '--source_host=%s' % source_host ]) + if log_to_s3: + script_arguments.append('--log_to_s3') + steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.COUNT_CHECK_SCRIPT_PATH) diff --git a/dataduct/steps/primary_key_check.py b/dataduct/steps/primary_key_check.py index d366654..c0ce7c1 100644 --- a/dataduct/steps/primary_key_check.py +++ b/dataduct/steps/primary_key_check.py @@ -17,7 +17,8 @@ class PrimaryKeyCheckStep(QATransformStep): """PrimaryKeyCheckStep class that checks a table for PK violations """ - def __init__(self, id, table_definition, **kwargs): + def __init__(self, id, table_definition, script_arguments=None, + log_to_s3=False, **kwargs): """Constructor for the PrimaryKeyCheckStep class Args: @@ -27,9 +28,15 @@ def __init__(self, id, table_definition, **kwargs): with open(parse_path(table_definition)) as f: table_def_string = f.read() + if script_arguments is None: + script_arguments = list() + # We initialize the table object to check valid strings - script_arguments = [ - '--table=%s' % Table(SqlStatement(table_def_string)).sql()] + script_arguments.append( + '--table=%s' % Table(SqlStatement(table_def_string)).sql()) + + if log_to_s3: + script_arguments.append('--log_to_s3') steps_path = os.path.abspath(os.path.dirname(__file__)) script = os.path.join(steps_path, const.PK_CHECK_SCRIPT_PATH) diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py index 82f80d3..a90b2fd 100644 --- a/dataduct/steps/scripts/column_check_test.py +++ b/dataduct/steps/scripts/column_check_test.py @@ -100,6 +100,8 @@ def main(): parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default='Check Column') + parser.add_argument('--log_to_s3', action='store_true', default=False) + parser.add_argument('--path_suffix', dest='path_suffix', default=None) args = parser.parse_args() @@ -113,7 +115,9 @@ def main(): name=args.test_name, sns_topic_arn=args.sns_topic_arn, 
tolerance=args.tolerance) - check.publish() + + check.publish(args.log_to_s3, dest_sql=args.destination_sql, + path_suffix=args.path_suffix) if __name__ == '__main__': diff --git a/dataduct/steps/scripts/count_check_test.py b/dataduct/steps/scripts/count_check_test.py index 49d2ec8..0d45409 100644 --- a/dataduct/steps/scripts/count_check_test.py +++ b/dataduct/steps/scripts/count_check_test.py @@ -64,6 +64,8 @@ def main(): parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default='Check Count') + parser.add_argument('--log_to_s3', action='store_true', default=False) + parser.add_argument('--path_suffix', dest='path_suffix', default=None) args = parser.parse_args() @@ -74,7 +76,9 @@ def main(): name=args.test_name, sns_topic_arn=args.sns_topic_arn, tolerance=args.tolerance) - check.publish() + + check.publish(args.log_to_s3, dest_sql=args.destination_sql, + path_suffix=args.path_suffix) if __name__ == '__main__': diff --git a/dataduct/steps/scripts/primary_key_test.py b/dataduct/steps/scripts/primary_key_test.py index cb36d67..a80cfa9 100644 --- a/dataduct/steps/scripts/primary_key_test.py +++ b/dataduct/steps/scripts/primary_key_test.py @@ -20,6 +20,8 @@ def main(): parser.add_argument('--sns_topic_arn', dest='sns_topic_arn', default=None) parser.add_argument('--test_name', dest='test_name', default="Check Primary Key") + parser.add_argument('--log_to_s3', action='store_true', default=False) + parser.add_argument('--path_suffix', dest='path_suffix', default=None) args = parser.parse_args() @@ -28,7 +30,7 @@ def main(): result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) check = PrimaryKeyCheck(len(result), name=args.test_name, sns_topic_arn=args.sns_topic_arn) - check.publish() + check.publish(args.log_to_s3, table=table, path_suffix=args.path_suffix) connection.close() diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml index 55cc30a..abd8921 100644 --- a/examples/example_column_check.yaml +++ b/examples/example_column_check.yaml @@ -11,3 +11,4 @@ steps: destination_sql: "SELECT network_id, network_name FROM prod.networks" sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER" sample_size: 10 + log_to_s3: true diff --git a/examples/example_count_check.yaml b/examples/example_count_check.yaml index 5afdbf5..65a02e2 100644 --- a/examples/example_count_check.yaml +++ b/examples/example_count_check.yaml @@ -10,3 +10,4 @@ steps: source_host: maestro destination_sql: "SELECT network_id, network_name FROM prod.networks" tolerance: 2.0 + log_to_s3: true diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml index a482e50..758f01b 100644 --- a/examples/example_primary_key_check.yaml +++ b/examples/example_primary_key_check.yaml @@ -7,3 +7,6 @@ description : Example for the primary-key-check step steps: - step_type: primary-key-check table_definition: tables/dev.test_table.sql + script_arguments: + - --log_to_s3 + - --path_suffix="dba_table_qa_tests" From 4efc1b0c42e1f2125b8bbc3713c8c82aa9eedf07 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 18 Feb 2015 23:00:41 -0800 Subject: [PATCH 092/175] Add tests for etl_actions --- dataduct/etl/etl_actions.py | 4 +- dataduct/etl/etl_pipeline.py | 2 +- dataduct/etl/tests/test_definition_parser.py | 20 ----- dataduct/etl/tests/test_etl_actions.py | 87 ++++++++++++++++++++ dataduct/etl/tests/test_etl_pipeline.py | 62 ++++++++++++++ 5 files changed, 152 insertions(+), 23 
deletions(-) delete mode 100644 dataduct/etl/tests/test_definition_parser.py create mode 100644 dataduct/etl/tests/test_etl_actions.py create mode 100644 dataduct/etl/tests/test_etl_pipeline.py diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 441a738..e1de274 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -30,8 +30,8 @@ def read_pipeline_definition(file_path): Raises: ETLInputError: If `file_path` extention is not yaml """ - extention = file_path.split('.').pop() - if extention != 'yaml': + extension = file_path.split('.').pop() + if extension != 'yaml': raise ETLInputError('Pipeline definition should have a yaml extention') with open(file_path) as f: definition = yaml.load(f.read()) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index aeaed54..e367475 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -234,7 +234,7 @@ def _s3_uri(self, data_type): key = [S3_BASE_PATH, data_type, self.name, self.version_name] if self.frequency == 'daily' and \ - data_type in [const.LOG_STR, const.DATA_STR]: + data_type in [const.LOG_STR, const.DATA_STR]: # For repeated loads, include load date key.append("#{format(@scheduledStartTime, 'YYYYMMdd')}") diff --git a/dataduct/etl/tests/test_definition_parser.py b/dataduct/etl/tests/test_definition_parser.py deleted file mode 100644 index 439d587..0000000 --- a/dataduct/etl/tests/test_definition_parser.py +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env python -""" -Tests for the definition parser functions -""" -import unittest -from nose.tools import raises - -from ..etl_actions import read_pipeline_definition -from ...utils.exceptions import ETLInputError - - -class DefinitionParserTests(unittest.TestCase): - """Tests for the definition parser. 
- """ - - @raises(ETLInputError) - def test_yaml_extension(self): - """Test if the yaml extension check works correctly - """ - read_pipeline_definition("name.txt") diff --git a/dataduct/etl/tests/test_etl_actions.py b/dataduct/etl/tests/test_etl_actions.py new file mode 100644 index 0000000..742926d --- /dev/null +++ b/dataduct/etl/tests/test_etl_actions.py @@ -0,0 +1,87 @@ +"""Tests for the ETL actions +""" +import os + +import unittest +from testfixtures import TempDirectory +from nose.tools import raises +from nose.tools import eq_ + +from ..etl_actions import read_pipeline_definition +from ..etl_actions import create_pipeline +from ...utils.exceptions import ETLInputError + + +class EtlActionsTests(unittest.TestCase): + """Tests for the ETL actions + """ + + def setUp(self): + """Setup text fixtures + """ + self.load_hour = '01' + self.load_min = '23' + load_time = self.load_hour + ':' + self.load_min + self.test_yaml = '\n'.join([ + 'name: example_load_redshift', + 'frequency: one-time', + 'load_time: ' + load_time, + 'max_retries: 5', + 'description: Example for the load_redshift step', + 'steps:', + '- step_type: extract-local', + ' path: data/test_table1.tsv', + '- step_type: load-redshift', + ' schema: dev', + ' table: test_table', + ]) + # Definition has no description field + self.test_definition = { + 'name': 'example_load_redshift', + 'frequency': 'one-time', + 'load_time': load_time, + 'max_retries': 5, + 'steps': [{ + 'step_type': 'extract-local', + 'path': 'data/test_table1.tsv', + }, { + 'step_type': 'load-redshift', + 'schema': 'dev', + 'table': 'test_table', + }], + } + + @staticmethod + @raises(ETLInputError) + def test_yaml_extension(): + """Test if the yaml extension check works correctly + for read_pipeline_definition + """ + read_pipeline_definition("name.txt") + + def test_read_pipeline_definition(self): + """Test if the pipeline definition is parsed correctly + """ + with TempDirectory() as directory: + directory.write('test_definition.yaml', self.test_yaml) + result = read_pipeline_definition( + os.path.join(directory.path, 'test_definition.yaml')) + eq_(result, self.test_definition) + + def test_create_pipeline(self): + """Test if simple pipeline creation is correct + """ + result = create_pipeline(self.test_definition) + # Check that pipeline properties are accurate + eq_(result.name, self.test_definition['name']) + eq_(result.frequency, self.test_definition['frequency']) + eq_(result.load_hour, int(self.load_hour)) + eq_(result.load_min, int(self.load_min)) + eq_(result.max_retries, self.test_definition['max_retries']) + # Check that steps are created + steps = result.steps + eq_(len(steps), 4) + assert 'bootstrap_ec2' in steps + assert 'ExtractLocalStep0' in steps + assert 'ExtractS3Step0' in steps + assert 'LoadRedshiftStep0' in steps diff --git a/dataduct/etl/tests/test_etl_pipeline.py b/dataduct/etl/tests/test_etl_pipeline.py new file mode 100644 index 0000000..4130dff --- /dev/null +++ b/dataduct/etl/tests/test_etl_pipeline.py @@ -0,0 +1,62 @@ +"""Tests for the ETL Pipeline object +""" +import unittest +from nose.tools import raises +from nose.tools import eq_ + +from ..etl_pipeline import ETLPipeline +from ...utils.exceptions import ETLInputError + + +class EtlPipelineTests(unittest.TestCase): + """Tests for the ETL Pipeline object + """ + + def setUp(self): + """Setup text fixtures + """ + self.default_pipeline = ETLPipeline('test_pipeline') + + @staticmethod + def test_construct_etl_pipeline(): + """Test if the constructor for EtlPipeline is correct + 
""" + result = ETLPipeline( + 'test_pipeline', + frequency='one-time', + ec2_resource_terminate_after='2 Hours', + ec2_resource_instance_type='m1.small', + delay=13, + emr_cluster_config={'cfg1': 'value'}, + load_time='12:34', + topic_arn='sns:topic-arn:test-case', + max_retries=5, + bootstrap={'cfg1': 'value'}, + ) + eq_(result.name, 'test_pipeline') + eq_(result.frequency, 'one-time') + eq_(result.ec2_resource_terminate_after, '2 Hours') + eq_(result.ec2_resource_instance_type, 'm1.small') + eq_(result.load_hour, 12) + eq_(result.load_min, 34) + eq_(result.delay, 13) + eq_(result.max_retries, 5) + eq_(result.topic_arn, 'sns:topic-arn:test-case') + eq_(result.bootstrap_definitions, {'cfg1': 'value'}) + eq_(result.emr_cluster_config, {'cfg1': 'value'}) + + @staticmethod + def test_no_load_time_default_none(): + """Test if the load_hour and load_min get set to None + if load_time is None + """ + result = ETLPipeline('no_load_time_pipeline', load_time=None) + eq_(result.load_hour, None) + eq_(result.load_min, None) + + @raises(ETLInputError) + def test_bad_data_type_throws(self): + """Test that exception is thrown if the data_type parameter for + _s3_uri is bad + """ + self.default_pipeline._s3_uri('TEST_DATA_TYPE') From 71cd06ec9f901c6600623e76a4592ad7edb958a4 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 19 Feb 2015 00:41:58 -0800 Subject: [PATCH 093/175] Trim steps assert since it appears to depend on config file --- dataduct/etl/tests/test_etl_actions.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dataduct/etl/tests/test_etl_actions.py b/dataduct/etl/tests/test_etl_actions.py index 742926d..30f0387 100644 --- a/dataduct/etl/tests/test_etl_actions.py +++ b/dataduct/etl/tests/test_etl_actions.py @@ -78,10 +78,7 @@ def test_create_pipeline(self): eq_(result.load_hour, int(self.load_hour)) eq_(result.load_min, int(self.load_min)) eq_(result.max_retries, self.test_definition['max_retries']) - # Check that steps are created + # Check that vital steps are created steps = result.steps - eq_(len(steps), 4) - assert 'bootstrap_ec2' in steps assert 'ExtractLocalStep0' in steps - assert 'ExtractS3Step0' in steps assert 'LoadRedshiftStep0' in steps From 12d24d19f869a2ce0dbbf7bc34c0207e5121df14 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 19 Feb 2015 00:44:41 -0800 Subject: [PATCH 094/175] Fix check class --- dataduct/database/parsers/__init__.py | 2 +- dataduct/database/parsers/create_table.py | 2 +- .../database/parsers/tests/test_create_table.py | 4 ++-- dataduct/database/table.py | 4 ++-- dataduct/qa/check.py | 14 ++++++++++---- dataduct/steps/scripts/primary_key_test.py | 3 ++- examples/example_primary_key_check.yaml | 2 +- 7 files changed, 19 insertions(+), 12 deletions(-) diff --git a/dataduct/database/parsers/__init__.py b/dataduct/database/parsers/__init__.py index bedba98..61d5813 100644 --- a/dataduct/database/parsers/__init__.py +++ b/dataduct/database/parsers/__init__.py @@ -9,5 +9,5 @@ from .select_query import parse_column_name from .create_table import parse_create_table -from .create_table import create_exits_clone +from .create_table import create_exists_clone from .create_view import parse_create_view diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index 63961f7..ac11eeb 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -163,7 +163,7 @@ def parse_create_table(string): return table_data -def create_exits_clone(string): +def 
create_exists_clone(string): """Create a clone of the table statement which has the exists check """ parser = get_definition_start() + restOfLine.setResultsName('definition') diff --git a/dataduct/database/parsers/tests/test_create_table.py b/dataduct/database/parsers/tests/test_create_table.py index 500e30a..bb489f2 100644 --- a/dataduct/database/parsers/tests/test_create_table.py +++ b/dataduct/database/parsers/tests/test_create_table.py @@ -7,7 +7,7 @@ from pyparsing import ParseException from ..create_table import parse_create_table -from ..create_table import create_exits_clone +from ..create_table import create_exists_clone class TestCreateTableStatement(TestCase): @@ -37,7 +37,7 @@ def test_exists_clone(): 'customer_id INTEGER DISTKEY PRIMARY KEY,' 'customer_name VARCHAR(200))') - exists_clone = create_exits_clone(query) + exists_clone = create_exists_clone(query) output = parse_create_table(exists_clone) eq_(output['full_name'], 'orders') eq_(output['temporary'], False) diff --git a/dataduct/database/table.py b/dataduct/database/table.py index 5d1e8c4..ab8b40b 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -1,7 +1,7 @@ """Script containing the table class object """ from .parsers import parse_create_table -from .parsers import create_exits_clone +from .parsers import create_exists_clone from .sql import SqlScript from .select_statement import SelectStatement from .column import Column @@ -143,7 +143,7 @@ def temporary_clone_script(self): def exists_clone_script(self): """Sql script to create a exists clone table """ - return SqlScript(create_exits_clone(self.sql_statement.sql())) + return SqlScript(create_exists_clone(self.sql_statement.sql())) def drop_script(self): """Sql script to drop the table diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index 83e7fcc..eddf26a 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -1,7 +1,8 @@ """Base class for QA steps that provides template function for publishing """ from boto.sns import SNSConnection -import datetime +from datetime import datetime +import os from .utils import render_output from ..config import Config @@ -141,9 +142,14 @@ def log_output_to_s3(self, destination_sql=None, table=None, string = '\t'.join(map(str, row)) # S3 Path computation - qa_test_dir_uri = config.etl.get('S3_BASE_PATH', '') + \ - config.elt.get('QA_LOG_PATH', 'qa') - qa_test_dir_uri += path_suffix if path_suffix else '' + qa_test_dir_uri = os.path.join( + 's3://', + config.etl.get('S3_ETL_BUCKET', ''), + config.etl.get('S3_BASE_PATH', ''), + config.etl.get('QA_LOG_PATH', 'qa'), + path_suffix if path_suffix else '') + print qa_test_dir_uri + parent_dir = S3Path(uri=qa_test_dir_uri, is_directory=True) key = '_'.join(map(str, row)).replace('.', '_').replace(' ', '_') diff --git a/dataduct/steps/scripts/primary_key_test.py b/dataduct/steps/scripts/primary_key_test.py index a80cfa9..58f362a 100644 --- a/dataduct/steps/scripts/primary_key_test.py +++ b/dataduct/steps/scripts/primary_key_test.py @@ -30,7 +30,8 @@ def main(): result = pdsql.read_sql(table.select_duplicates_script().sql(), connection) check = PrimaryKeyCheck(len(result), name=args.test_name, sns_topic_arn=args.sns_topic_arn) - check.publish(args.log_to_s3, table=table, path_suffix=args.path_suffix) + check.publish(args.log_to_s3, table=table.full_name, + path_suffix=args.path_suffix) connection.close() diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml index 758f01b..4b870cd 100644 --- 
a/examples/example_primary_key_check.yaml +++ b/examples/example_primary_key_check.yaml @@ -9,4 +9,4 @@ steps: table_definition: tables/dev.test_table.sql script_arguments: - --log_to_s3 - - --path_suffix="dba_table_qa_tests" + - --path_suffix=dba_table_qa_tests From 2cbeb7d37a0aa10b55b52295000bbad0fa4a050d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 19 Feb 2015 07:02:20 -0800 Subject: [PATCH 095/175] Reporting Pipeline Instances --- dataduct/etl/etl_pipeline.py | 39 ++++++++++++++++++++++++++++++++++++ dataduct/pipeline/utils.py | 38 +++++++++++++++++++++++++++++++++++ dataduct/qa/check.py | 1 - 3 files changed, 77 insertions(+), 1 deletion(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 5c2a74f..ad8487c 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -2,6 +2,9 @@ Class definition for DataPipeline """ from datetime import datetime +import csv +import os +from StringIO import StringIO import yaml from .utils import process_steps @@ -16,6 +19,7 @@ from ..pipeline import Schedule from ..pipeline import SNSAlarm from ..pipeline.utils import list_pipelines +from ..pipeline.utils import list_formatted_instance_details from ..s3 import S3File from ..s3 import S3Path @@ -30,6 +34,7 @@ S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) NAME_PREFIX = config.etl.get('NAME_PREFIX', const.EMPTY_STR) +DP_INSTANCE_LOG_PATH = config.etl.get('DP_INSTANCE_LOG_PATH', const.None) class ETLPipeline(object): @@ -461,6 +466,37 @@ def pipeline_objects(self): result.extend(step.pipeline_objects) return result + @staticmethod + def log_s3_dp_instance_data(pipeline): + """Uploads instance info for dp_instances to S3 + """ + dp_instance_entries = list_formatted_instance_details(pipeline) + if len(dp_instance_entries) > 0: + + output_string = StringIO() + writer = csv.writer(output_string, delimiter='\t') + writer.writerows(dp_instance_entries) + + # S3 Path computation + uri = os.path.join( + 's3://', + config.etl.get('S3_ETL_BUCKET', ''), + config.etl.get('S3_BASE_PATH', ''), + config.etl.get('DP_INSTANCE_LOG_PATH'), + datetime.utcnow().strftime('%Y%m%d')) + + dp_instances_dir = S3Path(uri=uri, is_directory=True) + dp_instances_path = S3Path( + key=pipeline.id + '.tsv', + parent_dir=dp_instances_dir, + ) + dp_instances_file = S3File( + text=output_string.getvalue(), + s3_path=dp_instances_path, + ) + dp_instances_file.upload_to_s3() + output_string.close() + def delete_if_exists(self): """Delete the pipelines with the same name as current pipeline """ @@ -469,6 +505,9 @@ def delete_if_exists(self): for p_iter in list_pipelines(): if p_iter['name'] == self.name: pipeline_instance = DataPipeline(pipeline_id=p_iter['id']) + + if DP_INSTANCE_LOG_PATH: + self.log_s3_dp_instance_data(pipeline_instance) pipeline_instance.delete() def s3_files(self): diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index c5d8db4..315fa3a 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -3,6 +3,12 @@ """ from boto.datapipeline.layer1 import DataPipelineConnection from time import sleep +import dateutil.parser + +DP_ACTUAL_END_TIME = '@actualEndTime' +DP_ATTEMPT_COUNT_KEY = '@attemptCount' +DP_INSTANCE_ID_KEY = 'id' +DP_INSTANCE_STATUS_KEY = '@status' def _update_sleep_time(last_time=None): @@ -152,3 +158,35 @@ def list_pipelines(conn=None): conn.list_pipelines, 'pipelineIdList', ) + + +def date_string(date): + """Normalizes a date 
string to YYYY-mm-dd HH:MM:SS + """ + if date is None: + return 'NULL' + return str(dateutil.parser.parse(date)) + + +def list_formatted_instance_details(pipeline): + """List of instance rows formatted to match + """ + etl_runs = pipeline.instance_details() + entries = [] + for etl_run_dt in sorted(etl_runs.keys()): + + # Look through instances + for instance in sorted( + etl_runs[etl_run_dt], + key=lambda x: x.get('@actualEndTime', None)): + entries.append( + [ + instance['id'], + pipeline.id, + date_string(etl_run_dt), + date_string(instance.get('@actualEndTime')), + instance['@status'], + instance.get('@attemptCount', 'NULL'), + ] + ) + return entries diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index eddf26a..8024509 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -148,7 +148,6 @@ def log_output_to_s3(self, destination_sql=None, table=None, config.etl.get('S3_BASE_PATH', ''), config.etl.get('QA_LOG_PATH', 'qa'), path_suffix if path_suffix else '') - print qa_test_dir_uri parent_dir = S3Path(uri=qa_test_dir_uri, is_directory=True) From 80bc4f232d8dea5f3f12d0dc7ea3feec342be957 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 19 Feb 2015 07:50:01 -0800 Subject: [PATCH 096/175] grant option --- dataduct/database/relation.py | 7 ++++--- dataduct/etl/etl_pipeline.py | 2 +- .../steps/scripts/create_load_redshift_runner.py | 16 +++++++++------- 3 files changed, 14 insertions(+), 11 deletions(-) diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py index 5024ca8..0c7b29e 100644 --- a/dataduct/database/relation.py +++ b/dataduct/database/relation.py @@ -45,13 +45,14 @@ def _grant_sql_builder(self, permission, user=None, group=None): raise ValueError('Atleast one of user / group needed') result = list() - base = 'GRANT %s ON %s TO ' % (permission, self.full_name) + base = 'GRANT %s ON %s TO {user} WITH GRANT OPTION' % ( + permission, self.full_name) if user is not None: - result.append(base + user) + result.append(base.format(user=user)) if group is not None: - result.append(base + 'GROUP %s' % group) + result.append(base.format(user='GROUP %s' % group)) return result def grant_script(self): diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index ad8487c..6c6926e 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -34,7 +34,7 @@ S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) NAME_PREFIX = config.etl.get('NAME_PREFIX', const.EMPTY_STR) -DP_INSTANCE_LOG_PATH = config.etl.get('DP_INSTANCE_LOG_PATH', const.None) +DP_INSTANCE_LOG_PATH = config.etl.get('DP_INSTANCE_LOG_PATH', const.NONE) class ETLPipeline(object): diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index cbd81da..7ae487d 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -10,11 +10,11 @@ from dataduct.database import Table -def load_redshift(table_definition, input_paths, max_error=0, +def load_redshift(table, input_paths, max_error=0, replace_invalid_char=None, no_escape=False, gzip=False): """Load redshift table with the data in the input s3 paths """ - table_name = Table(SqlStatement(table_definition)).full_name + table_name = table.full_name # Credentials string aws_key, aws_secret, token = get_aws_credentials() @@ -26,7 +26,7 @@ def load_redshift(table_definition, input_paths, 
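# For orientation, a hedged sketch (not necessarily the exact template used
# further down in this script) of the kind of Redshift COPY statement these
# arguments feed into:
#   COPY dev.test_table FROM 's3://bucket/input/path'
#   CREDENTIALS 'aws_access_key_id=...;aws_secret_access_key=...;token=...'
#   DELIMITER '\t' MAXERROR 10 ACCEPTINVCHARS AS '?' GZIP;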
max_error=0, delete_statement = 'DELETE FROM %s;' % table_name error_string = 'MAXERROR %d' % max_error if max_error > 0 else '' if replace_invalid_char is not None: - invalid_char_str = 'ACCEPTINVCHARS AS %s' % replace_invalid_char + invalid_char_str = "ACCEPTINVCHARS AS '%s'" % replace_invalid_char else: invalid_char_str = '' @@ -66,13 +66,15 @@ def main(): connection = redshift_connection() cursor = connection.cursor() + table = Table(SqlStatement(args.table_definition)) + # Create table in redshift, this is safe due to the if exists condition - cursor.execute(args.table_definition) + cursor.execute(table.create_script().sql()) # Load data into redshift - load_query = load_redshift(args.table_definition, args.input_paths, - args.max_error, args.replace_invalid_char, - args.no_escape, args.gzip) + load_query = load_redshift(table, args.input_paths, args.max_error, + args.replace_invalid_char, args.no_escape, + args.gzip) cursor.execute(load_query) cursor.execute('COMMIT') From f26c6e46bd73b2e1fd17b6ad9153777a4f4de0f4 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Tue, 17 Feb 2015 16:38:20 +0900 Subject: [PATCH 097/175] Added .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 3652bb7..68f9a1e 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,6 @@ *.png .coverage + +# pycharm or intellij +.idea/ From fe9a30726df68e92a780993680fc25f2947cda79 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Tue, 17 Feb 2015 16:39:41 +0900 Subject: [PATCH 098/175] Supported region property --- dataduct/pipeline/data_pipeline.py | 8 ++++++-- dataduct/pipeline/utils.py | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/dataduct/pipeline/data_pipeline.py b/dataduct/pipeline/data_pipeline.py index 775b575..2686104 100644 --- a/dataduct/pipeline/data_pipeline.py +++ b/dataduct/pipeline/data_pipeline.py @@ -2,13 +2,16 @@ Base class for data pipeline instance """ from collections import defaultdict - +from boto.datapipeline import regions from boto.datapipeline.layer1 import DataPipelineConnection +from dataduct.config import Config from .pipeline_object import PipelineObject from .utils import list_pipeline_instances from ..utils.exceptions import ETLInputError +config = Config() +REGION = config.etl.get('REGION') class DataPipeline(object): """DataPipeline classes with objects and metadata. 
@@ -29,7 +32,8 @@ def __init__(self, unique_id=None, name=None, pipeline_id=None): Note: If pipelineId is provided we don't need name or unique_id """ - self.conn = DataPipelineConnection() + region = next((x for x in regions() if x.name == str(REGION).lower()), None) + self.conn = DataPipelineConnection(region=region) self.objects = [] if pipeline_id: diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index c5d8db4..9e3e9bd 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -1,9 +1,13 @@ """ Shared utility functions """ +from boto.datapipeline import regions from boto.datapipeline.layer1 import DataPipelineConnection from time import sleep +from dataduct.config import Config +config = Config() +REGION = config.etl.get('REGION') def _update_sleep_time(last_time=None): """Expotentially decay sleep times between calls incase of failures @@ -102,7 +106,8 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): instances(list): list of pipeline instances """ if conn is None: - conn = DataPipelineConnection() + region = next((x for x in regions() if x.name == str(REGION).lower()), None) + conn = DataPipelineConnection(region=region) # Get all instances instance_ids = sorted(get_list_from_boto(conn.query_objects, @@ -146,7 +151,8 @@ def list_pipelines(conn=None): pipelines(list): list of pipelines fetched with boto """ if conn is None: - conn = DataPipelineConnection() + region = next((x for x in regions() if x.name == str(REGION).lower()), None) + conn = DataPipelineConnection(region=region) return get_list_from_boto( conn.list_pipelines, From b71645849e713860e05af1e33c732aa0444a0b10 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Fri, 20 Feb 2015 19:45:17 +0900 Subject: [PATCH 099/175] Create get_datapipeline_connection method and set default value of region --- dataduct/pipeline/data_pipeline.py | 10 ++-------- dataduct/pipeline/utils.py | 16 +++++++++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/dataduct/pipeline/data_pipeline.py b/dataduct/pipeline/data_pipeline.py index 2686104..5488ae3 100644 --- a/dataduct/pipeline/data_pipeline.py +++ b/dataduct/pipeline/data_pipeline.py @@ -2,16 +2,11 @@ Base class for data pipeline instance """ from collections import defaultdict -from boto.datapipeline import regions -from boto.datapipeline.layer1 import DataPipelineConnection -from dataduct.config import Config from .pipeline_object import PipelineObject -from .utils import list_pipeline_instances +from .utils import list_pipeline_instances, get_datapipeline_connection from ..utils.exceptions import ETLInputError -config = Config() -REGION = config.etl.get('REGION') class DataPipeline(object): """DataPipeline classes with objects and metadata. 
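For reference, a brief sketch of how the REGION setting resolves to a boto connection, assuming it holds a valid region name such as 'us-west-2'; if REGION is unset or unrecognised, next() falls back to None and boto's default region is used:

from boto.datapipeline import regions
from boto.datapipeline.layer1 import DataPipelineConnection

# 'us-west-2' stands in for whatever REGION holds in the etl config
region = next((r for r in regions() if r.name == 'us-west-2'), None)
conn = DataPipelineConnection(region=region)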
@@ -32,8 +27,7 @@ def __init__(self, unique_id=None, name=None, pipeline_id=None): Note: If pipelineId is provided we don't need name or unique_id """ - region = next((x for x in regions() if x.name == str(REGION).lower()), None) - self.conn = DataPipelineConnection(region=region) + self.conn = get_datapipeline_connection() self.objects = [] if pipeline_id: diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index 9e3e9bd..fe31677 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -7,7 +7,8 @@ from dataduct.config import Config config = Config() -REGION = config.etl.get('REGION') +REGION = config.etl.get('REGION', None) + def _update_sleep_time(last_time=None): """Expotentially decay sleep times between calls incase of failures @@ -106,8 +107,7 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): instances(list): list of pipeline instances """ if conn is None: - region = next((x for x in regions() if x.name == str(REGION).lower()), None) - conn = DataPipelineConnection(region=region) + get_datapipeline_connection() # Get all instances instance_ids = sorted(get_list_from_boto(conn.query_objects, @@ -141,6 +141,13 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): return instances + +def get_datapipeline_connection(): + region = next((x for x in regions() if x.name == str(REGION).lower()), None) + conn = DataPipelineConnection(region=region) + return conn + + def list_pipelines(conn=None): """Fetch a list of all pipelines with boto @@ -151,8 +158,7 @@ def list_pipelines(conn=None): pipelines(list): list of pipelines fetched with boto """ if conn is None: - region = next((x for x in regions() if x.name == str(REGION).lower()), None) - conn = DataPipelineConnection(region=region) + conn = get_datapipeline_connection() return get_list_from_boto( conn.list_pipelines, From 1b0d9daab51675a8e541cbfd1b23a1cf7ff8b2fa Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Fri, 20 Feb 2015 20:04:51 +0900 Subject: [PATCH 100/175] Change to explicit import --- dataduct/pipeline/data_pipeline.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataduct/pipeline/data_pipeline.py b/dataduct/pipeline/data_pipeline.py index 5488ae3..6a2ad62 100644 --- a/dataduct/pipeline/data_pipeline.py +++ b/dataduct/pipeline/data_pipeline.py @@ -4,7 +4,8 @@ from collections import defaultdict from .pipeline_object import PipelineObject -from .utils import list_pipeline_instances, get_datapipeline_connection +from .utils import list_pipeline_instances +from .utils import get_datapipeline_connection from ..utils.exceptions import ETLInputError From b6d35d59097290b75107f0eeb06e093f0cfee087 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Fri, 20 Feb 2015 20:05:38 +0900 Subject: [PATCH 101/175] Added comments for docs --- dataduct/pipeline/utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index fe31677..d0ba78d 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -143,6 +143,11 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): def get_datapipeline_connection(): + """Get boto connection of AWS data pipeline + + Returns: + DataPipelineConnection: boto connection + """ region = next((x for x in regions() if x.name == str(REGION).lower()), None) conn = DataPipelineConnection(region=region) return conn From 6dd4934f32e4111bd2817082b6cd8776225cbfd2 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 21 Feb 2015 12:50:14 -0800 
Subject: [PATCH 102/175] better logging of pipelines --- dataduct/data_access/connection.py | 43 ++++++++++++++--------- dataduct/database/database.py | 3 +- dataduct/database/parsers/create_table.py | 4 ++- dataduct/etl/etl_actions.py | 12 ++++--- dataduct/etl/etl_pipeline.py | 30 +++++++++++----- dataduct/pipeline/default_object.py | 9 ++--- dataduct/pipeline/emr_resource.py | 12 ++++--- dataduct/tests/test_import.py | 1 + 8 files changed, 71 insertions(+), 43 deletions(-) diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py index 742076a..979450a 100644 --- a/dataduct/data_access/connection.py +++ b/dataduct/data_access/connection.py @@ -7,23 +7,26 @@ from ..config import Config from ..utils.helpers import retry +from ..utils.helpers import exactly_one from ..utils.exceptions import ETLConfigError config = Config() @retry(2, 60) -def redshift_connection(**kwargs): +def redshift_connection(redshift_creds=None, **kwargs): """Fetch a psql connection object to redshift """ - if not hasattr(config, 'redshift'): - raise ETLConfigError('Redshift config not found') + if redshift_creds is None: + if not hasattr(config, 'redshift'): + raise ETLConfigError('Redshift config not found') + redshift_creds = config.redshift connection = psycopg2.connect( - host=config.redshift['HOST'], - user=config.redshift['USERNAME'], - password=config.redshift['PASSWORD'], - port=config.redshift['PORT'], - database=config.redshift['DATABASE_NAME'], + host=redshift_creds['HOST'], + user=redshift_creds['USERNAME'], + password=redshift_creds['PASSWORD'], + port=redshift_creds['PORT'], + database=redshift_creds['DATABASE_NAME'], connect_timeout=10, **kwargs ) @@ -31,23 +34,29 @@ def redshift_connection(**kwargs): return connection @retry(2, 60) -def rds_connection(host_name, cursorclass=MySQLdb.cursors.SSCursor, - **kwargs): - """Fetch a psql connection object to redshift +def rds_connection(database_name=None, sql_creds=None, + cursorclass=MySQLdb.cursors.SSCursor, **kwargs): + """Fetch a mysql connection object to rds databases """ - if not hasattr(config, 'mysql'): - raise ETLConfigError('mysql not found in dataduct configs') - if host_name not in config.mysql: - raise ETLConfigError('Config for hostname: %s not found' %host_name) + assert exactly_one(database_name, sql_creds), \ + 'Either database or params needed' + + if sql_creds is None: + if not hasattr(config, 'mysql'): + raise ETLConfigError('mysql not found in dataduct configs') + + if database_name not in config.mysql: + raise ETLConfigError( + 'Config for hostname: %s not found' %database_name) - sql_creds = config.mysql[host_name] + sql_creds = config.mysql[database_name] connection = MySQLdb.connect( host=sql_creds['HOST'], user=sql_creds['USERNAME'], passwd=sql_creds['PASSWORD'], - db=host_name, + db=database_name, charset='utf8', # Necessary for foreign chars cursorclass=cursorclass, **kwargs diff --git a/dataduct/database/database.py b/dataduct/database/database.py index 1c974ae..bd86d72 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -15,6 +15,7 @@ import logging logger = logging.getLogger(__name__) + class Database(object): """Class representing a database """ @@ -115,7 +116,7 @@ def sorted_relations(self): """Topological sort of the relations for dependency management """ if self.has_cycles(): - print 'Warning: database has cycles' + logger.warning('Database has cycles') sorted_relations = [] graph = dict((x.full_name, x.dependencies) for x in self.relations()) diff --git 
a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index ac11eeb..e21869d 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -28,6 +28,8 @@ from .helpers import temporary_check from .helpers import to_dict +import logging +logger = logging.getLogger(__name__) FK_REFERENCE = 'fk_reference' @@ -157,7 +159,7 @@ def parse_create_table(string): get_constraints_parser().parseString(field)) table_data['constraints'].append(constraint) except ParseException: - print '[Error] : ', field + logger.error(field) raise return table_data diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 708d2fd..cc8e771 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -56,7 +56,9 @@ def create_pipeline(definition): # Add the steps to the pipeline object etl.create_steps(steps) - print 'Created pipeline. Name: %s' % etl.name + logger.info('Created pipeline. Name: %s', etl.name) + pipeline = etl.create_pipeline() + logger.debug(yaml.dump(pipeline.aws_format)) return etl @@ -71,7 +73,7 @@ def validate_pipeline(etl, force_overwrite=False): if force_overwrite: etl.delete_if_exists() etl.validate() - print 'Validated pipeline. Id: %s' % etl.pipeline.id + logger.info('Validated pipeline. Id: %s', etl.pipeline.id) def activate_pipeline(etl): @@ -81,9 +83,9 @@ def activate_pipeline(etl): etl(EtlPipeline): pipeline object that needs to be activated """ etl.activate() - print 'Activated pipeline. Id: %s' % etl.pipeline.id - print 'Monitor pipeline here: %s' % \ - URL_TEMPLATE.format(ID=etl.pipeline.id) + logger.info('Activated pipeline. Id: %s', etl.pipeline.id) + logger.info('Monitor pipeline here: %s', + URL_TEMPLATE.format(ID=etl.pipeline.id)) def visualize_pipeline(etl, activities_only=False, filename=None): diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 6c6926e..58eeba5 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -28,6 +28,9 @@ from ..utils.exceptions import ETLInputError from ..utils import constants as const +import logging +logger = logging.getLogger(__name__) + config = Config() S3_ETL_BUCKET = config.etl['S3_ETL_BUCKET'] MAX_RETRIES = config.etl.get('MAX_RETRIES', const.ZERO) @@ -168,6 +171,7 @@ def create_base_objects(self): self.default = self.create_pipeline_object( object_class=DefaultObject, sns=self.sns, + pipeline_log_uri=self.s3_log_dir, ) @property @@ -423,16 +427,14 @@ def create_steps(self, steps_params, is_bootstrap=False): step_class = step_param.pop('step_class') step_args = step_class.arguments_processor(self, step_param) except Exception: - print 'Error creating step with params : ', step_param + logger.error('Error creating step with params : %s', step_param) raise try: step = step_class(**step_args) except Exception: - print "Error creating step of class %s, step_param %s." 
% ( - str(step_class.__name__), - str(step_args) - ) + logger.error("Error creating step of class %s, step_param %s.", + str(step_class.__name__), str(step_args)) raise # Add the step to the pipeline @@ -521,11 +523,11 @@ def s3_files(self): result.extend(pipeline_object.s3_files) return result - def validate(self): - """Validate the given pipeline definition by creating a pipeline + def create_pipeline(self): + """Create the datapipeline object Returns: - errors(list): list of errors in the pipeline, empty if no errors + definition(string): Return the yaml pipeline definition """ # Create AwsPipeline and add objects to it @@ -533,10 +535,20 @@ def validate(self): for pipeline_object in self.pipeline_objects(): self.pipeline.add_object(pipeline_object) + return self.pipeline + + def validate(self): + """Validate the given pipeline definition by creating a pipeline + + Returns: + errors(list): list of errors in the pipeline, empty if no errors + """ + # Check for errors self.errors = self.pipeline.validate_pipeline_definition() if len(self.errors) > 0: - print '\nThere are errors with your pipeline:\n', self.errors + logger.error('There are errors with your pipeline:\n %s', + self.errors) # Update pipeline definition self.pipeline.update_pipeline_definition() diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index 9c3912f..a42823b 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -14,12 +14,8 @@ class DefaultObject(PipelineObject): """Default object added to all pipelines """ - def __init__(self, - id, - sns=None, - scheduleType='cron', - failureAndRerunMode='CASCADE', - **kwargs): + def __init__(self, id, pipeline_log_uri, sns=None, scheduleType='cron', + failureAndRerunMode='CASCADE', **kwargs): """Constructor for the DefaultObject class Args: @@ -39,5 +35,6 @@ def __init__(self, failureAndRerunMode=failureAndRerunMode, role=ROLE, resourceRole=RESOURCE_ROLE, + pipelineLogUri=pipeline_log_uri, onFail=sns ) diff --git a/dataduct/pipeline/emr_resource.py b/dataduct/pipeline/emr_resource.py index 717a24a..bf5118d 100644 --- a/dataduct/pipeline/emr_resource.py +++ b/dataduct/pipeline/emr_resource.py @@ -22,6 +22,9 @@ CLUSTER_AMI = config.emr.get('CLUSTER_AMI', '2.4.7') KEY_PAIR = config.etl.get('KEY_PAIR', const.NONE) +import logging +logger = logging.getLogger(__name__) + class EmrResource(PipelineObject): """EMR Resource class @@ -100,7 +103,8 @@ def __init__(self, if self['taskInstanceType'].find('xlarge') >= 0: if num_task_instances > 10: - print 'Using taskInstanceType: (%s)' % \ - self['taskInstanceType'] - print 'WARNING!!! Are you sure you need', \ - '%s task instances?' 
% num_task_instances + logger.info('Using taskInstanceType: (%s)', + self['taskInstanceType']) + logger.warning( + 'Are you sure you need %s task instances?', + num_task_instances) diff --git a/dataduct/tests/test_import.py b/dataduct/tests/test_import.py index 786a6df..92e1bcf 100644 --- a/dataduct/tests/test_import.py +++ b/dataduct/tests/test_import.py @@ -2,6 +2,7 @@ """ from unittest import TestCase + class TestImports(TestCase): """Tests for dependencies """ From 3564ada418384b42a05662d8e89b7bd4e99fb067 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 21 Feb 2015 15:06:11 -0800 Subject: [PATCH 103/175] connection configs --- dataduct/data_access/__init__.py | 4 ++- dataduct/data_access/connection.py | 54 +++++++++++++++++++----------- 2 files changed, 37 insertions(+), 21 deletions(-) diff --git a/dataduct/data_access/__init__.py b/dataduct/data_access/__init__.py index a22c888..731c805 100644 --- a/dataduct/data_access/__init__.py +++ b/dataduct/data_access/__init__.py @@ -1,2 +1,4 @@ -from .connection import redshift_connection +from .connection import get_sql_config from .connection import rds_connection +from .connection import get_redshift_config +from .connection import redshift_connection diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py index 979450a..0193543 100644 --- a/dataduct/data_access/connection.py +++ b/dataduct/data_access/connection.py @@ -11,15 +11,23 @@ from ..utils.exceptions import ETLConfigError config = Config() +CONNECTION_RETRIES = config.etl.get('CONNECTION_RETRIES', 2) -@retry(2, 60) + +def get_redshift_config(): + """Get redshift config from config file and return the dictionary + """ + if not hasattr(config, 'redshift'): + raise ETLConfigError('Redshift config not found') + return config.redshift + + +@retry(CONNECTION_RETRIES, 60) def redshift_connection(redshift_creds=None, **kwargs): """Fetch a psql connection object to redshift """ if redshift_creds is None: - if not hasattr(config, 'redshift'): - raise ETLConfigError('Redshift config not found') - redshift_creds = config.redshift + redshift_creds = get_redshift_config() connection = psycopg2.connect( host=redshift_creds['HOST'], @@ -28,12 +36,27 @@ def redshift_connection(redshift_creds=None, **kwargs): port=redshift_creds['PORT'], database=redshift_creds['DATABASE_NAME'], connect_timeout=10, - **kwargs - ) - + **kwargs) return connection -@retry(2, 60) + +def get_sql_config(database_name): + """Get SQL config from config file and return the dictionary + """ + if not hasattr(config, 'mysql'): + raise ETLConfigError('mysql not found in dataduct configs') + + if database_name not in config.mysql: + raise ETLConfigError( + 'Config for hostname: %s not found' %database_name) + + sql_creds = config.mysql[database_name] + sql_creds['DATABASE'] = database_name + + return sql_creds + + +@retry(CONNECTION_RETRIES, 60) def rds_connection(database_name=None, sql_creds=None, cursorclass=MySQLdb.cursors.SSCursor, **kwargs): """Fetch a mysql connection object to rds databases @@ -43,23 +66,14 @@ def rds_connection(database_name=None, sql_creds=None, 'Either database or params needed' if sql_creds is None: - if not hasattr(config, 'mysql'): - raise ETLConfigError('mysql not found in dataduct configs') - - if database_name not in config.mysql: - raise ETLConfigError( - 'Config for hostname: %s not found' %database_name) - - sql_creds = config.mysql[database_name] + sql_creds = get_sql_config(database_name) connection = MySQLdb.connect( host=sql_creds['HOST'], 
user=sql_creds['USERNAME'], passwd=sql_creds['PASSWORD'], - db=database_name, + db=sql_creds['DATABASE'], charset='utf8', # Necessary for foreign chars cursorclass=cursorclass, - **kwargs - ) - + **kwargs) return connection From 29b6d1caa2e89f65e9fd9ad2d9b8b72cd0f3d52d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 21 Feb 2015 17:05:38 -0800 Subject: [PATCH 104/175] logger level fix --- dataduct/config/config.py | 8 +++++-- dataduct/config/logger_config.py | 7 ++++-- dataduct/etl/etl_actions.py | 4 +--- dataduct/etl/etl_pipeline.py | 26 ++++++----------------- dataduct/etl/utils.py | 11 +++++----- dataduct/qa/check.py | 10 ++++----- dataduct/utils/helpers.py | 8 +++++++ examples/example_primary_key_check.yaml | 2 +- examples/resources/scripts/s3_profiler.py | 3 +-- 9 files changed, 38 insertions(+), 41 deletions(-) diff --git a/dataduct/config/config.py b/dataduct/config/config.py index 824ac8c..865bea3 100644 --- a/dataduct/config/config.py +++ b/dataduct/config/config.py @@ -59,7 +59,11 @@ def __new__(cls, mode=None): # Override the select fields specified based on mode for key in cls._root_config[mode]: - cls._root_config[key].update(cls._root_config[mode][key]) + if isinstance(cls._root_config[key], dict): + cls._root_config[key].update( + cls._root_config[mode][key]) + else: + cls._root_config[key] = cls._root_config[mode][key] cls._isInstantiated = True cls._root_mode = mode @@ -75,7 +79,7 @@ def __init__(self, mode=None): def __str__(self): """String output for the config object """ - return yaml.dump(self._root_config, default_flow_style=False) + return yaml.dump(self._root_config, default_flow_style=False, indent=4) def raw_config(self): """String formatted config file diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index 79cd596..67f78a9 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -22,7 +22,10 @@ def logger_configuration(): log_directory = config.logging.get( 'LOG_DIR', os.path.join(os.path.expanduser(CONFIG_DIR))) file_name = config.logging.get('LOG_FILE', LOG_FILE) - console_level = config.logging.get('DEBUG_LEVEL', logging.WARNING) + console_level = config.logging.get('CONSOLE_DEBUG_LEVEL', + logging.INFO) + file_level = config.logging.get('FILE_DEBUG_LEVEL', + logging.DEBUG) if not os.path.exists(log_directory): os.mkdir(log_directory) @@ -33,7 +36,7 @@ def logger_configuration(): file_handler = RotatingFileHandler(os.path.join(log_directory, file_name), maxBytes=200000, backupCount=10) - file_handler.setLevel(logging.INFO) + file_handler.setLevel(file_level) file_handler.setFormatter(logging.Formatter(FILE_FORMAT_STR, datefmt='%Y-%m-%d %H:%M')) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index cc8e771..a6535a8 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -57,9 +57,6 @@ def create_pipeline(definition): # Add the steps to the pipeline object etl.create_steps(steps) logger.info('Created pipeline. Name: %s', etl.name) - pipeline = etl.create_pipeline() - logger.debug(yaml.dump(pipeline.aws_format)) - return etl @@ -73,6 +70,7 @@ def validate_pipeline(etl, force_overwrite=False): if force_overwrite: etl.delete_if_exists() etl.validate() + logger.debug(yaml.dump(etl.pipeline.aws_format)) logger.info('Validated pipeline. 
Id: %s', etl.pipeline.id) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 58eeba5..eceea87 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -26,6 +26,7 @@ from ..s3 import S3LogPath from ..utils.exceptions import ETLInputError +from ..utils.helpers import get_s3_base_path from ..utils import constants as const import logging @@ -480,12 +481,9 @@ def log_s3_dp_instance_data(pipeline): writer.writerows(dp_instance_entries) # S3 Path computation - uri = os.path.join( - 's3://', - config.etl.get('S3_ETL_BUCKET', ''), - config.etl.get('S3_BASE_PATH', ''), - config.etl.get('DP_INSTANCE_LOG_PATH'), - datetime.utcnow().strftime('%Y%m%d')) + uri = os.path.join(get_s3_base_path(), + config.etl.get('DP_INSTANCE_LOG_PATH'), + datetime.utcnow().strftime('%Y%m%d')) dp_instances_dir = S3Path(uri=uri, is_directory=True) dp_instances_path = S3Path( @@ -523,27 +521,17 @@ def s3_files(self): result.extend(pipeline_object.s3_files) return result - def create_pipeline(self): - """Create the datapipeline object + def validate(self): + """Validate the given pipeline definition by creating a pipeline Returns: - definition(string): Return the yaml pipeline definition + errors(list): list of errors in the pipeline, empty if no errors """ - # Create AwsPipeline and add objects to it self.pipeline = DataPipeline(self.name) for pipeline_object in self.pipeline_objects(): self.pipeline.add_object(pipeline_object) - return self.pipeline - - def validate(self): - """Validate the given pipeline definition by creating a pipeline - - Returns: - errors(list): list of errors in the pipeline, empty if no errors - """ - # Check for errors self.errors = self.pipeline.validate_pipeline_definition() if len(self.errors) > 0: diff --git a/dataduct/etl/utils.py b/dataduct/etl/utils.py index a2709c2..db3ea00 100644 --- a/dataduct/etl/utils.py +++ b/dataduct/etl/utils.py @@ -6,7 +6,6 @@ from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError - STEP_CLASSES = { 'column-check': ColumnCheckStep, 'count-check': CountCheckStep, @@ -52,17 +51,17 @@ def get_custom_steps(): return custom_steps +STEP_CONFIG = STEP_CLASSES.copy() +STEP_CONFIG.update(get_custom_steps()) + + def process_steps(steps_params): """Format the step parameters by changing step type to step class """ - step_config = STEP_CLASSES.copy() - step_config.update(get_custom_steps()) steps = [] - for step_param in steps_params: params = step_param.copy() step_type = params.pop('step_type') - params['step_class'] = step_config[step_type] + params['step_class'] = STEP_CONFIG[step_type] steps.append(params) - return steps diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index 8024509..c568fb6 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -10,6 +10,7 @@ from ..s3 import S3Path from ..s3 import S3File from ..utils.helpers import exactly_one +from ..utils.helpers import get_s3_base_path QA_TEST_ROW_LENGTH = 8 @@ -142,12 +143,9 @@ def log_output_to_s3(self, destination_sql=None, table=None, string = '\t'.join(map(str, row)) # S3 Path computation - qa_test_dir_uri = os.path.join( - 's3://', - config.etl.get('S3_ETL_BUCKET', ''), - config.etl.get('S3_BASE_PATH', ''), - config.etl.get('QA_LOG_PATH', 'qa'), - path_suffix if path_suffix else '') + qa_test_dir_uri = os.path.join(get_s3_base_path(), + config.etl.get('QA_LOG_PATH', 'qa'), + path_suffix if path_suffix else '') parent_dir = S3Path(uri=qa_test_dir_uri, is_directory=True) diff --git a/dataduct/utils/helpers.py 
b/dataduct/utils/helpers.py index 49265af..257abbf 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -137,3 +137,11 @@ def parse_path(path, path_type=RESOURCE_BASE_PATH): # Return the path as is. return path + + +def get_s3_base_path(): + """Get the root S3 path from config + """ + config = Config() + return os.path.join('s3://', config.etl.get('S3_ETL_BUCKET', ''), + config.etl.get('S3_BASE_PATH', '')) diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml index 4b870cd..4b63d84 100644 --- a/examples/example_primary_key_check.yaml +++ b/examples/example_primary_key_check.yaml @@ -7,6 +7,6 @@ description : Example for the primary-key-check step steps: - step_type: primary-key-check table_definition: tables/dev.test_table.sql + log_to_s3: true script_arguments: - - --log_to_s3 - --path_suffix=dba_table_qa_tests diff --git a/examples/resources/scripts/s3_profiler.py b/examples/resources/scripts/s3_profiler.py index 0835d52..767e4a7 100755 --- a/examples/resources/scripts/s3_profiler.py +++ b/examples/resources/scripts/s3_profiler.py @@ -1,6 +1,5 @@ #!/usr/bin/env python -""" -Walk over files in S3 output node and provide basic information about them +"""Walk over files in S3 output node and provide basic information about them """ import argparse From 16734051aa758cdf001c660f3e4cf8a46f2382bc Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 23 Feb 2015 02:22:27 -0800 Subject: [PATCH 105/175] PR comments --- dataduct/config/logger_config.py | 12 +++++++----- dataduct/data_access/connection.py | 2 +- dataduct/etl/etl_pipeline.py | 2 +- dataduct/etl/utils.py | 3 ++- dataduct/pipeline/utils.py | 10 +++++----- dataduct/qa/check.py | 3 ++- 6 files changed, 18 insertions(+), 14 deletions(-) diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index 67f78a9..d2405e9 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -21,11 +21,13 @@ def logger_configuration(): if hasattr(config, 'logging'): log_directory = config.logging.get( 'LOG_DIR', os.path.join(os.path.expanduser(CONFIG_DIR))) - file_name = config.logging.get('LOG_FILE', LOG_FILE) - console_level = config.logging.get('CONSOLE_DEBUG_LEVEL', - logging.INFO) - file_level = config.logging.get('FILE_DEBUG_LEVEL', - logging.DEBUG) + file_name = config.logging.get( + 'LOG_FILE', LOG_FILE) + + console_level = config.logging.get( + 'CONSOLE_DEBUG_LEVEL', logging.INFO) + file_level = config.logging.get( + 'FILE_DEBUG_LEVEL', logging.DEBUG) if not os.path.exists(log_directory): os.mkdir(log_directory) diff --git a/dataduct/data_access/connection.py b/dataduct/data_access/connection.py index 0193543..224d26b 100644 --- a/dataduct/data_access/connection.py +++ b/dataduct/data_access/connection.py @@ -48,7 +48,7 @@ def get_sql_config(database_name): if database_name not in config.mysql: raise ETLConfigError( - 'Config for hostname: %s not found' %database_name) + 'Config for hostname: %s not found' % database_name) sql_creds = config.mysql[database_name] sql_creds['DATABASE'] = database_name diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 98105c8..4200253 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -439,7 +439,7 @@ def create_steps(self, steps_params, is_bootstrap=False): try: step = step_class(**step_args) except Exception: - logger.error("Error creating step of class %s, step_param %s.", + logger.error('Error creating step of class %s, step_param %s', 
str(step_class.__name__), str(step_args)) raise diff --git a/dataduct/etl/utils.py b/dataduct/etl/utils.py index db3ea00..57629f2 100644 --- a/dataduct/etl/utils.py +++ b/dataduct/etl/utils.py @@ -45,7 +45,8 @@ def get_custom_steps(): # Check if step_class is of type ETLStep if not issubclass(step_class, ETLStep): - raise ETLInputError('Step type %s is not of type ETLStep') + raise ETLInputError('Step type %s is not of type ETLStep', + step_class.__name__) custom_steps[step_type] = step_class return custom_steps diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index 970da30..1af1f0e 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -196,15 +196,15 @@ def list_formatted_instance_details(pipeline): # Look through instances for instance in sorted( etl_runs[etl_run_dt], - key=lambda x: x.get('@actualEndTime', None)): + key=lambda x: x.get(DP_ACTUAL_END_TIME, None)): entries.append( [ - instance['id'], + instance[DP_INSTANCE_ID_KEY], pipeline.id, date_string(etl_run_dt), - date_string(instance.get('@actualEndTime')), - instance['@status'], - instance.get('@attemptCount', 'NULL'), + date_string(instance.get(DP_ACTUAL_END_TIME)), + instance[DP_INSTANCE_STATUS_KEY], + instance.get(DP_ATTEMPT_COUNT_KEY, 'NULL'), ] ) return entries diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index c568fb6..039e8b4 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -121,7 +121,8 @@ def log_output_to_s3(self, destination_sql=None, table=None, path_suffix=None): """Log the results of the QA test in S3 """ - assert exactly_one(destination_sql, table), "Needs table or dest_sql" + if not exactly_one(destination_sql, table): + raise Exception('Needs table or destination_sql') if destination_sql is not None: full_table_name = SelectStatement(destination_sql).dependencies[0] From 8f8711a36d66fd9e523c236f7144a39d5a776224 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 23 Feb 2015 02:41:42 -0800 Subject: [PATCH 106/175] Fix exception --- dataduct/etl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/etl/utils.py b/dataduct/etl/utils.py index 57629f2..08f53dd 100644 --- a/dataduct/etl/utils.py +++ b/dataduct/etl/utils.py @@ -45,7 +45,7 @@ def get_custom_steps(): # Check if step_class is of type ETLStep if not issubclass(step_class, ETLStep): - raise ETLInputError('Step type %s is not of type ETLStep', + raise ETLInputError('Step type %s is not of type ETLStep' % step_class.__name__) custom_steps[step_type] = step_class From 1989f024c2dea22959b4a8f98bd6e27942644d91 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Mon, 23 Feb 2015 17:57:10 +0900 Subject: [PATCH 107/175] Fixed output_path parameter in transform step --- dataduct/steps/etl_step.py | 4 ++-- dataduct/steps/transform.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index ed3e252..8d70b2a 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -452,11 +452,11 @@ def pop_inputs(input_args): return input_args @staticmethod - def get_output_s3_path(output_path): + def get_output_s3_path(output_path, is_directory=False): """Create an S3 Path variable based on the output path """ if output_path: - s3_path = S3Path(uri=output_path) + s3_path = S3Path(uri=output_path, is_directory=is_directory) else: s3_path = None return s3_path diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index ff03cf0..9dfd741 100644 --- a/dataduct/steps/transform.py +++ 
b/dataduct/steps/transform.py @@ -50,7 +50,7 @@ def __init__(self, # Create output_node based on output_path base_output_node = self.create_s3_data_node( - self.get_output_s3_path(output_path)) + self.get_output_s3_path(output_path, True)) script_arguments = self.translate_arguments(script_arguments) @@ -99,7 +99,7 @@ def __init__(self, self.create_pipeline_object( object_class=ShellCommandActivity, input_node=input_nodes, - output_node=base_output_node, + output_node=self._output, resource=self.resource, schedule=self.schedule, script_uri=script, From 64cb9e9d8054f9d423253ee38ef9133a1915cd79 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Mon, 23 Feb 2015 21:22:40 +0900 Subject: [PATCH 108/175] Changed default value of is_directory in get_output_s3_path method --- dataduct/steps/etl_step.py | 2 +- dataduct/steps/transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dataduct/steps/etl_step.py b/dataduct/steps/etl_step.py index 8d70b2a..f14ce29 100644 --- a/dataduct/steps/etl_step.py +++ b/dataduct/steps/etl_step.py @@ -452,7 +452,7 @@ def pop_inputs(input_args): return input_args @staticmethod - def get_output_s3_path(output_path, is_directory=False): + def get_output_s3_path(output_path, is_directory=True): """Create an S3 Path variable based on the output path """ if output_path: diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 9dfd741..8c22057 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -50,7 +50,7 @@ def __init__(self, # Create output_node based on output_path base_output_node = self.create_s3_data_node( - self.get_output_s3_path(output_path, True)) + self.get_output_s3_path(output_path)) script_arguments = self.translate_arguments(script_arguments) From fc10f208b30db032727e00775ed6880a9f5088cc Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Mon, 23 Feb 2015 21:45:44 +0900 Subject: [PATCH 109/175] Reversed hunk, https://github.com/coursera/dataduct/pull/40/files#r25154736 --- dataduct/steps/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 8c22057..ff03cf0 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -99,7 +99,7 @@ def __init__(self, self.create_pipeline_object( object_class=ShellCommandActivity, input_node=input_nodes, - output_node=self._output, + output_node=base_output_node, resource=self.resource, schedule=self.schedule, script_uri=script, From 6dd1a78bba5dd8652c379ea083c1632f1d087dfa Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 23 Feb 2015 20:33:54 -0800 Subject: [PATCH 110/175] fix docstring --- dataduct/steps/create_load_redshift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/steps/create_load_redshift.py b/dataduct/steps/create_load_redshift.py index 2e42a5d..3c6751f 100644 --- a/dataduct/steps/create_load_redshift.py +++ b/dataduct/steps/create_load_redshift.py @@ -1,4 +1,4 @@ -"""ETL step wrapper for QA step can be executed on Ec2 resource +"""ETL step wrapper for loading into redshift with the COPY command """ import os From fb7a4cf2bc8a19c5e31a9f75414315abec9e5ca1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 24 Feb 2015 00:12:42 -0800 Subject: [PATCH 111/175] Add script override option --- dataduct/steps/column_check.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index 7c3fc2f..9c25119 100644 --- 
a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -22,7 +22,7 @@ class ColumnCheckStep(QATransformStep): """ def __init__(self, id, source_sql, source_host, - destination_table_definition=None, + destination_table_definition=None, script=None, destination_sql=None, sql_tail_for_source=None, sample_size=100, tolerance=1.0, script_arguments=None, log_to_s3=False, **kwargs): @@ -62,8 +62,9 @@ def __init__(self, id, source_sql, source_host, if log_to_s3: script_arguments.append('--log_to_s3') - steps_path = os.path.abspath(os.path.dirname(__file__)) - script = os.path.join(steps_path, const.COLUMN_CHECK_SCRIPT_PATH) + if script is None: + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.COLUMN_CHECK_SCRIPT_PATH) super(ColumnCheckStep, self).__init__( id=id, script=script, script_arguments=script_arguments, **kwargs) From 74ce317e4f150781b4706779b4c42e34cbffd249 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 24 Feb 2015 01:13:47 -0800 Subject: [PATCH 112/175] bug fix in qa sns alerts --- dataduct/database/table.py | 2 +- dataduct/steps/column_check.py | 2 +- dataduct/steps/qa_transform.py | 11 ++++------- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/dataduct/database/table.py b/dataduct/database/table.py index ab8b40b..c95c4b5 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -75,7 +75,7 @@ def update_columns_with_constrains(self): def columns(self): """Unsorted list of columns in the table """ - return self._columns.values() + return sorted(self._columns.values(), key=lambda x: x.position) def column(self, column_name): """Get the column with the given name diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index 9c25119..82dc8ea 100644 --- a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -82,7 +82,7 @@ def convert_destination_to_column_sql(destination_table_definition=None, destination_columns = destination_table.columns() primary_key_index, primary_keys = zip(*[ (idx, col.name) - for idx, col in enumerate(destination_columns.columns()) + for idx, col in enumerate(destination_columns) if col.primary]) if len(destination_columns) == len(primary_key_index): diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index d7b1946..fa2fea6 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -26,17 +26,14 @@ def __init__(self, """ if sns_topic_arn is None: - sns_topic_arn = config.etl['SNS_TOPIC_ARN_WARNING'] + sns_topic_arn = config.etl.get('SNS_TOPIC_ARN_WARNING', None) if script_arguments is None: script_arguments = list() - script_arguments.extend( - [ - '--sns_topic_arn=%s' % sns_topic_arn, - '--test_name=%s' % (pipeline_name + "." + id) - ] - ) + script_arguments.append('--test_name=%s' % (pipeline_name + "." 
+ id)) + if sns_topic_arn: + script_arguments.append('--sns_topic_arn=%s' % sns_topic_arn) super(QATransformStep, self).__init__( id=id, From 3f5643a2a60f3830cd23f4d4072ff49edf7192c1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 24 Feb 2015 10:18:13 -0800 Subject: [PATCH 113/175] Change to count check --- dataduct/qa/count_check.py | 15 +++++++++++---- dataduct/steps/count_check.py | 22 ++++++++++++++++------ 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/dataduct/qa/count_check.py b/dataduct/qa/count_check.py index 7a4437e..c0bd116 100644 --- a/dataduct/qa/count_check.py +++ b/dataduct/qa/count_check.py @@ -26,10 +26,17 @@ def error_rate(self): If there are no values in the source but some in the destination, the error is None """ - if self.source_count > 0: - error_difference = float(self.source_count - self.destination_count) - return abs(error_difference * 100) / self.source_count - elif self.destination_count == 0: + return self.calculate_error_rate(self.source_count, + self.destination_count) + + @staticmethod + def calculate_error_rate(source_count, destination_count): + """Calculate the error rate based on the source and destination counts + """ + if source_count > 0: + error_difference = float(source_count - destination_count) + return abs(error_difference * 100) / source_count + elif destination_count == 0: return 0 else: return None diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py index f78ac88..0e1c077 100644 --- a/dataduct/steps/count_check.py +++ b/dataduct/steps/count_check.py @@ -6,8 +6,10 @@ from ..config import Config from ..database import SqlScript from ..database import SqlStatement +from ..database import Table from ..utils import constants as const from ..utils.helpers import exactly_one +from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError config = Config() @@ -19,9 +21,9 @@ class CountCheckStep(QATransformStep): """ def __init__(self, id, source_host, source_sql=None, source_table_name=None, - destination_table_name=None, destination_sql=None, - tolerance=1.0, script_arguments=None, log_to_s3=False, - **kwargs): + destination_table_name=None, destination_table_definition=None, + destination_sql=None, tolerance=1.0, script_arguments=None, + log_to_s3=False, script=None, **kwargs): """Constructor for the CountCheckStep class Args: @@ -30,7 +32,8 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, **kwargs(optional): Keyword arguments directly passed to base class """ - if not exactly_one(destination_table_name, destination_sql): + if not exactly_one(destination_table_name, destination_sql, + destination_table_definition): raise ETLInputError('One of dest table or dest sql needed') if not exactly_one(source_sql, source_table_name): @@ -39,6 +42,12 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, if script_arguments is None: script_arguments = list() + if destination_table_definition is not None: + with open(parse_path(destination_table_definition)) as f: + destination_table_string = f.read() + destination_table = Table(SqlScript(destination_table_string)) + destination_table_name = destination_table.full_name + # Get the EDW column SQL dest_sql = self.convert_destination_to_count_sql( destination_table_name, destination_sql) @@ -56,8 +65,9 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, if log_to_s3: script_arguments.append('--log_to_s3') - steps_path = os.path.abspath(os.path.dirname(__file__)) 
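The count comparison added in this patch reduces to a percentage difference relative to the source count. A self-contained sketch of that arithmetic, mirroring the calculate_error_rate staticmethod above with made-up example counts, to make the edge cases explicit:

def calculate_error_rate(source_count, destination_count):
    # Error expressed as a percentage of the source row count.
    if source_count > 0:
        return abs(float(source_count - destination_count) * 100) / source_count
    elif destination_count == 0:
        return 0      # both sides empty: treated as no error
    return None       # rows only in the destination: error is undefined


print(calculate_error_rate(1000, 990))  # 1.0, i.e. within the default tolerance of 1.0
print(calculate_error_rate(0, 0))       # 0
print(calculate_error_rate(0, 5))       # None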
- script = os.path.join(steps_path, const.COUNT_CHECK_SCRIPT_PATH) + if script is None: + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.COUNT_CHECK_SCRIPT_PATH) super(CountCheckStep, self).__init__( id=id, script=script, script_arguments=script_arguments, **kwargs) From 478322a731fd4f32ac44866118bc0aa39c56b343 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 24 Feb 2015 10:23:24 -0800 Subject: [PATCH 114/175] error prompt fix --- dataduct/steps/count_check.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py index 0e1c077..7a792b7 100644 --- a/dataduct/steps/count_check.py +++ b/dataduct/steps/count_check.py @@ -34,10 +34,12 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, if not exactly_one(destination_table_name, destination_sql, destination_table_definition): - raise ETLInputError('One of dest table or dest sql needed') + raise ETLInputError( + 'One of dest table name/schema or dest sql needed') if not exactly_one(source_sql, source_table_name): - raise ETLInputError('One of dest table or dest sql needed') + raise ETLInputError( + 'One of source table name or source sql needed') if script_arguments is None: script_arguments = list() From 063cfaa4d0efe529c8189531b038e78264e46064 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 24 Feb 2015 12:58:49 -0800 Subject: [PATCH 115/175] varchar parser fix --- dataduct/database/parsers/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index 3360f4b..d4b3ce5 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -20,7 +20,7 @@ _double = CaselessKeyword('DOUBLE') _boolean = CaselessKeyword('BOOLEAN') _char = CaselessKeyword('CHAR') -_varchar = Combine(CaselessKeyword('VARCHAR') + '(' + Word(nums) + ')') +_varchar = Combine(CaselessKeyword('VARCHAR') + '(' + Word(alphanums) + ')') _date = CaselessKeyword('DATE') _timestamp = CaselessKeyword('TIMESTAMP') From edf7ebe6317efacc9963318661e0b014106aa771 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 25 Feb 2015 13:22:26 -0800 Subject: [PATCH 116/175] Added slack integration hook, but did not hook it up --- dataduct/utils/slack_hook.py | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 dataduct/utils/slack_hook.py diff --git a/dataduct/utils/slack_hook.py b/dataduct/utils/slack_hook.py new file mode 100644 index 0000000..334a0f6 --- /dev/null +++ b/dataduct/utils/slack_hook.py @@ -0,0 +1,33 @@ +"""Action hook for posting a message on slack +""" + +from ..config.config import Config + +import logging +logger = logging.getLogger(__name__) + + +def post_message(message): + """Post a message on a specified slack channel + + Args: + message(str): The message to post + """ + try: + import slack + import slack.chat + config = Config() + slack_config = config.etl['slack'] + slack.api_token = slack_config['api_token'] + slack.chat.post_message(slack_config['channel_name'], + message, + username=slack_config['bot_username']) + except Exception: + logger.info('If you want to post a slack message when you activate a pipeline') # noqa + logger.info('1) Run: pip install pyslack') + logger.info('2) Visit https://api.slack/com/web to generate a token') + logger.info(' and add:') + logger.info(' api_token:') + logger.info(' channel_name:') + logger.info(' 
bot_username:') + logger.info(' to the etl section of your config file') From 5a83b565c3bec904ccca0e8c31d8488722b8c889 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 25 Feb 2015 14:10:55 -0800 Subject: [PATCH 117/175] Hooked up slack integration --- dataduct/etl/etl_actions.py | 3 +++ dataduct/utils/slack_hook.py | 34 +++++++++++++++++++++++----------- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index bdab57d..8ac54ce 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -8,6 +8,7 @@ from ..pipeline import RedshiftNode from ..pipeline import S3Node from ..utils.exceptions import ETLInputError +from ..utils.slack_hook import post_message import logging logger = logging.getLogger(__name__) @@ -84,6 +85,8 @@ def activate_pipeline(etl): logger.info('Activated pipeline. Id: %s', etl.pipeline.id) logger.info('Monitor pipeline here: %s', URL_TEMPLATE.format(ID=etl.pipeline.id)) + # Post a slack message + post_message('Pipeline started: %s' % etl.name) def visualize_pipeline(etl, activities_only=False, filename=None): diff --git a/dataduct/utils/slack_hook.py b/dataduct/utils/slack_hook.py index 334a0f6..2ace749 100644 --- a/dataduct/utils/slack_hook.py +++ b/dataduct/utils/slack_hook.py @@ -1,7 +1,7 @@ """Action hook for posting a message on slack """ -from ..config.config import Config +from ..config import Config import logging logger = logging.getLogger(__name__) @@ -13,21 +13,33 @@ def post_message(message): Args: message(str): The message to post """ + # If any of these fail, silently skip because the user doesn't know about + # the slack integration or doesn't care try: import slack import slack.chat config = Config() slack_config = config.etl['slack'] + except KeyError: + return + + # If any of these configs fail, output error message and fail because the + # user has misconfigured the slack integration + try: slack.api_token = slack_config['api_token'] slack.chat.post_message(slack_config['channel_name'], message, - username=slack_config['bot_username']) - except Exception: - logger.info('If you want to post a slack message when you activate a pipeline') # noqa - logger.info('1) Run: pip install pyslack') - logger.info('2) Visit https://api.slack/com/web to generate a token') - logger.info(' and add:') - logger.info(' api_token:') - logger.info(' channel_name:') - logger.info(' bot_username:') - logger.info(' to the etl section of your config file') + username=slack_config.get('bot_username', + 'Dataduct')) + except KeyError: + message = ['If you want to post a slack message when you activate a pipeline', # noqa + '1) Run: pip install pyslack', + '2) Visit https://api.slack/com/web to generate a token', + '3) Add:', + ' api_token:', + ' channel_name:', + ' bot_username:', + ' to the etl section of your config file'] + for line in message: + logger.error(line) + raise From f10938455569175513cef6734543a68755588edf Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 25 Feb 2015 15:08:55 -0800 Subject: [PATCH 118/175] Code review changes --- dataduct/etl/etl_actions.py | 4 ++-- dataduct/utils/slack_hook.py | 38 +++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 8ac54ce..49925d6 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -85,8 +85,8 @@ def activate_pipeline(etl): logger.info('Activated pipeline. 
Id: %s', etl.pipeline.id) logger.info('Monitor pipeline here: %s', URL_TEMPLATE.format(ID=etl.pipeline.id)) - # Post a slack message - post_message('Pipeline started: %s' % etl.name) + # Post a slack message if slack is setup + post_message('{user} started pipeline: `%s`' % etl.name) def visualize_pipeline(etl, activities_only=False, filename=None): diff --git a/dataduct/utils/slack_hook.py b/dataduct/utils/slack_hook.py index 2ace749..9f969a9 100644 --- a/dataduct/utils/slack_hook.py +++ b/dataduct/utils/slack_hook.py @@ -8,38 +8,40 @@ def post_message(message): - """Post a message on a specified slack channel + """Post a message on a specified slack channel. + Will silently skip if there is no etl.slack configuration. + Will print a help message if etl.slack is misconfigured. Args: - message(str): The message to post + message(str): The message to post with templating + {user}: The username as specified in the config file """ - # If any of these fail, silently skip because the user doesn't know about - # the slack integration or doesn't care - try: - import slack - import slack.chat - config = Config() - slack_config = config.etl['slack'] - except KeyError: + + # If there is no slack configuration, silently skip because the user + # doesn't know about slack integration or doesn't care + config = Config() + slack_config = config.etl.get('slack', None) + if slack_config is None: return - # If any of these configs fail, output error message and fail because the - # user has misconfigured the slack integration try: + import slack + import slack.chat slack.api_token = slack_config['api_token'] + user = slack_config.get('username', 'Unknown User') slack.chat.post_message(slack_config['channel_name'], - message, + message.format(user=user), username=slack_config.get('bot_username', 'Dataduct')) - except KeyError: + except Exception: message = ['If you want to post a slack message when you activate a pipeline', # noqa '1) Run: pip install pyslack', '2) Visit https://api.slack/com/web to generate a token', - '3) Add:', + '3) Add ([] denotes optional field):', ' api_token:', ' channel_name:', - ' bot_username:', + ' [username:]', + ' [bot_username:]', ' to the etl section of your config file'] for line in message: - logger.error(line) - raise + logger.info(line) From a53b7d6039b568e5c9b0a11f9487934b128522ef Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 25 Feb 2015 17:07:42 -0800 Subject: [PATCH 119/175] Bug fixes for count check --- dataduct/database/parsers/utils.py | 2 +- dataduct/database/relation.py | 7 ++++--- dataduct/etl/tests/test_etl_actions.py | 2 +- dataduct/etl/tests/test_etl_pipeline.py | 2 +- dataduct/steps/scripts/column_check_test.py | 8 ++++++++ dataduct/steps/scripts/create_load_redshift_runner.py | 1 + dataduct/steps/transform.py | 5 +++++ 7 files changed, 21 insertions(+), 6 deletions(-) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index d4b3ce5..fdc0d57 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -60,5 +60,5 @@ column_types |= _boolean | _char | _varchar | _date | _timestamp subquery = Combine('(' + ZeroOrMore(CharsNotIn(')')) + ')') -_word = Word(alphanums+"_-. *") +_word = Word(alphanums+"_-. 
*`") def_field = Combine(OneOrMore(_word | subquery)) diff --git a/dataduct/database/relation.py b/dataduct/database/relation.py index 0c7b29e..204a9ce 100644 --- a/dataduct/database/relation.py +++ b/dataduct/database/relation.py @@ -45,14 +45,15 @@ def _grant_sql_builder(self, permission, user=None, group=None): raise ValueError('Atleast one of user / group needed') result = list() - base = 'GRANT %s ON %s TO {user} WITH GRANT OPTION' % ( + option_string = 'WITH GRANT OPTION' + base = 'GRANT %s ON %s TO {user} {option}' % ( permission, self.full_name) if user is not None: - result.append(base.format(user=user)) + result.append(base.format(user=user, option=option_string)) if group is not None: - result.append(base.format(user='GROUP %s' % group)) + result.append(base.format(user='GROUP %s' % group, option='')) return result def grant_script(self): diff --git a/dataduct/etl/tests/test_etl_actions.py b/dataduct/etl/tests/test_etl_actions.py index 30f0387..557b07c 100644 --- a/dataduct/etl/tests/test_etl_actions.py +++ b/dataduct/etl/tests/test_etl_actions.py @@ -73,7 +73,7 @@ def test_create_pipeline(self): """ result = create_pipeline(self.test_definition) # Check that pipeline properties are accurate - eq_(result.name, self.test_definition['name']) + assert result.name.endswith(self.test_definition['name']) eq_(result.frequency, self.test_definition['frequency']) eq_(result.load_hour, int(self.load_hour)) eq_(result.load_min, int(self.load_min)) diff --git a/dataduct/etl/tests/test_etl_pipeline.py b/dataduct/etl/tests/test_etl_pipeline.py index 4130dff..17ad436 100644 --- a/dataduct/etl/tests/test_etl_pipeline.py +++ b/dataduct/etl/tests/test_etl_pipeline.py @@ -33,7 +33,7 @@ def test_construct_etl_pipeline(): max_retries=5, bootstrap={'cfg1': 'value'}, ) - eq_(result.name, 'test_pipeline') + assert result.name.endswith('test_pipeline') eq_(result.frequency, 'one-time') eq_(result.ec2_resource_terminate_after, '2 Hours') eq_(result.ec2_resource_instance_type, 'm1.small') diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py index a90b2fd..58d0d9d 100644 --- a/dataduct/steps/scripts/column_check_test.py +++ b/dataduct/steps/scripts/column_check_test.py @@ -7,11 +7,15 @@ import argparse import collections import re +import pandas import pandas.io.sql as pdsql from dataduct.data_access import redshift_connection from dataduct.data_access import rds_connection from dataduct.qa import ColumnCheck +pandas.options.display.max_colwidth = 100000 +pandas.options.display.max_rows = 10000 + def _get_source_data(sql, hostname, sample_size): """Gets the DataFrame containing all the rows of the table @@ -75,6 +79,8 @@ def _get_destination_data(sql, primary_keys): sql, ) + print query + data = pdsql.read_sql(query, connection) connection.close() # All columns apart from last are PK columns @@ -108,8 +114,10 @@ def main(): # Open up a connection and read the source and destination tables source_data = _get_source_data(args.source_sql, args.source_host, args.sample_size) + print source_data destination_data = _get_destination_data(args.destination_sql, list(source_data.index)) + print destination_data check = ColumnCheck(source_data, destination_data, name=args.test_name, diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index 7ae487d..f96b05c 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -15,6 +15,7 @@ 
def load_redshift(table, input_paths, max_error=0, """Load redshift table with the data in the input s3 paths """ table_name = table.full_name + print 'Loading data into %s' % table_name # Credentials string aws_key, aws_secret, token = get_aws_credentials() diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index ff03cf0..832516b 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -12,6 +12,9 @@ from ..utils.exceptions import ETLInputError from ..utils import constants as const +import logging +logger = logging.getLogger(__name__) + SCRIPT_ARGUMENT_TYPE_STRING = 'string' SCRIPT_ARGUMENT_TYPE_SQL = 'sql' @@ -96,6 +99,8 @@ def __init__(self, else: self._output = base_output_node + logger.debug(script_arguments) + self.create_pipeline_object( object_class=ShellCommandActivity, input_node=input_nodes, From 7809edbc4ebf291e5815d75d3cb7a4b5ed54cbc9 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 25 Feb 2015 17:35:28 -0800 Subject: [PATCH 120/175] Implement container builds --- .travis.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 13e79b7..6798f2b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,14 @@ language: python python: - 2.7 +cache: pip +sudo: false + +addons: + apt_packages: + - graphviz # command to install dependencies install: - - sudo apt-get install graphviz - pip install coveralls - pip install -r requirements.txt From 111186e1473c4fb802158e2789f51429d3299f19 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 25 Feb 2015 23:45:30 -0800 Subject: [PATCH 121/175] Remove travis caching --- .travis.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6798f2b..a58a195 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,7 +2,6 @@ language: python python: - 2.7 -cache: pip sudo: false addons: From eb8ce83d5b04d3544db5ff08e691b6db5c78a740 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 26 Feb 2015 11:47:47 -0800 Subject: [PATCH 122/175] Add resource tagging to pipelines --- bin/dataduct | 3 ++- dataduct/etl/etl_actions.py | 2 -- dataduct/etl/etl_pipeline.py | 27 +++++++++++++++++++++++++-- dataduct/pipeline/data_pipeline.py | 25 +++++++++++++++++++++++-- dataduct/pipeline/schedule.py | 1 + 5 files changed, 51 insertions(+), 7 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index a4307f0..ac18e85 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -23,6 +23,7 @@ DATABASE_COMMAND = 'database' PIPELINE_COMMAND = 'pipeline' VISUALIZE_COMMAND = 'visualize' +DEV = 'dev' def config_actions(action, filename): """Config related actions are executed in this block @@ -249,7 +250,7 @@ def main(): mode = args.mode if mode is not None: # We assume mode:dev = mode:None - if mode == 'dev': + if mode == DEV: mode = None # To instantiate the singleton object with the correct state diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 49925d6..4264701 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -41,8 +41,6 @@ def read_pipeline_definition(file_path): # remove the variables key from the pipeline definition # http://stackoverflow.com/questions/4150782/using-yaml-with-variables definition.pop('variables', None) - definition.pop('description', None) - return definition diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 4200253..53d7ab8 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -54,7 +54,7 @@ def __init__(self, name, 
frequency='one-time', ec2_resource_instance_type=INSTANCE_TYPE, delay=0, emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=MAX_RETRIES, - bootstrap=None): + bootstrap=None, description=None): """Constructor for the pipeline class Args: @@ -82,6 +82,7 @@ def __init__(self, name, frequency='one-time', self.load_hour = load_hour self.load_min = load_min self.delay = delay + self.description = description self.max_retries = max_retries self.topic_arn = topic_arn @@ -526,6 +527,25 @@ def s3_files(self): result.extend(pipeline_object.s3_files) return result + def get_tags(self): + """Get all the pipeline tags that are specified in the config + """ + tag_config = config.etl.get('TAGS', None) + if tag_config is None: + return None + + tags = [] + for key, value in tag_config.iteritems(): + if 'string' in value and 'variable' in value: + raise ETLInputError( + 'Tag config can not have both string and variable') + elif 'string' in value: + tags.append({'key': key, 'value': value['string']}) + elif 'variable' in value: + variable = getattr(self, value['variable']) + tags.append({'key': key, 'value': variable}) + return tags + def validate(self): """Validate the given pipeline definition by creating a pipeline @@ -533,7 +553,10 @@ def validate(self): errors(list): list of errors in the pipeline, empty if no errors """ # Create AwsPipeline and add objects to it - self.pipeline = DataPipeline(self.name) + self.pipeline = DataPipeline(unique_id=self.name, + description=self.description, + tags=self.get_tags()) + for pipeline_object in self.pipeline_objects(): self.pipeline.add_object(pipeline_object) diff --git a/dataduct/pipeline/data_pipeline.py b/dataduct/pipeline/data_pipeline.py index 6a2ad62..acc5488 100644 --- a/dataduct/pipeline/data_pipeline.py +++ b/dataduct/pipeline/data_pipeline.py @@ -1,6 +1,7 @@ """ Base class for data pipeline instance """ +import json from collections import defaultdict from .pipeline_object import PipelineObject @@ -17,7 +18,8 @@ class DataPipeline(object): executing it. """ - def __init__(self, unique_id=None, name=None, pipeline_id=None): + def __init__(self, unique_id=None, name=None, pipeline_id=None, + tags=None, description=None): """Constructor for the datapipeline object Args: @@ -43,7 +45,8 @@ def __init__(self, unique_id=None, name=None, pipeline_id=None): if not name: name = unique_id - response = self.conn.create_pipeline(name, unique_id) + response = self.custom_create_pipeline( + name, unique_id, description, tags) self.pipeline_id = response['pipelineId'] @property @@ -113,3 +116,21 @@ def instance_details(self): for instance in instances: result[instance['@scheduledStartTime']].append(instance) return result + + def custom_create_pipeline(self, name, unique_id, description=None, + tags=None): + """ + Creates a new empty pipeline. 
Adds tags feature not yet available in + boto + + Args: + tags(list(dict)): a list of tags in the format + [{key: foo, value: bar}] + """ + params = {'name': name, 'uniqueId': unique_id, } + if description is not None: + params['description'] = description + if tags is not None: + params['tags'] = tags + return self.conn.make_request(action='CreatePipeline', + body=json.dumps(params)) diff --git a/dataduct/pipeline/schedule.py b/dataduct/pipeline/schedule.py index 35533e4..b50c260 100644 --- a/dataduct/pipeline/schedule.py +++ b/dataduct/pipeline/schedule.py @@ -14,6 +14,7 @@ FEQUENCY_PERIOD_CONVERTION = { + 'weekly': ('1 week', None), 'daily': ('1 day', None), 'hourly': ('1 hour', None), 'one-time': ('15 minutes', 1), From 932afbbac94e43c3b40e468e0cf630d12c196c0c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 26 Feb 2015 12:30:45 -0800 Subject: [PATCH 123/175] Fix tests --- dataduct/etl/tests/test_etl_actions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dataduct/etl/tests/test_etl_actions.py b/dataduct/etl/tests/test_etl_actions.py index 557b07c..8b108eb 100644 --- a/dataduct/etl/tests/test_etl_actions.py +++ b/dataduct/etl/tests/test_etl_actions.py @@ -39,6 +39,7 @@ def setUp(self): self.test_definition = { 'name': 'example_load_redshift', 'frequency': 'one-time', + 'description': 'Example for the load_redshift step', 'load_time': load_time, 'max_retries': 5, 'steps': [{ From 139d712d4d5994a60b241be2f2593f52b7ad95a2 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 26 Feb 2015 14:24:15 -0800 Subject: [PATCH 124/175] Fixing the log uri --- dataduct/etl/etl_pipeline.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 53d7ab8..67a1c5b 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -222,11 +222,9 @@ def _s3_uri(self, data_type): # Versioning prevents using data from older versions key = [S3_BASE_PATH, data_type, self.name, self.version_name] - if self.frequency == 'daily' and \ - data_type in [const.LOG_STR, const.DATA_STR]: - + if self.frequency == 'daily' and data_type == const.DATA_STR: # For repeated loads, include load date - key.append("#{format(@scheduledStartTime, 'YYYYMMdd')}") + key.append("#{format(@scheduledStartTime, 'YYYYMMdd-hh-mm-ss')}") if data_type == const.LOG_STR: return S3LogPath(key, bucket=S3_ETL_BUCKET, is_directory=True) From f5442ef552e55898a0a921a4e536e41b94ce49fc Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 27 Feb 2015 05:00:32 -0800 Subject: [PATCH 125/175] Bug squashing --- dataduct/etl/etl_pipeline.py | 14 ++++---------- dataduct/steps/count_check.py | 13 ++++++++----- dataduct/steps/transform.py | 1 + 3 files changed, 13 insertions(+), 15 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 67a1c5b..32b62fc 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -294,17 +294,11 @@ def emr_cluster(self): # Process the boostrap input bootstrap = self.emr_cluster_config.get('bootstrap', None) if bootstrap: - if isinstance(bootstrap, dict): - # If bootstrap script is not a path to local file - param_type = bootstrap['type'] - bootstrap = bootstrap['value'] - else: - # Default the type to path of a local file - param_type = 'path' - - if param_type == 'path': - bootstrap = S3File(path=bootstrap) + if 'string' in bootstrap: + bootstrap = bootstrap['string'] + elif 'script' in bootstrap: # Set the S3 Path for the bootstrap script + bootstrap = 
S3File(path=bootstrap) bootstrap.s3_path = self.s3_source_dir self.emr_cluster_config['bootstrap'] = bootstrap diff --git a/dataduct/steps/count_check.py b/dataduct/steps/count_check.py index 7a792b7..1977a05 100644 --- a/dataduct/steps/count_check.py +++ b/dataduct/steps/count_check.py @@ -23,7 +23,7 @@ class CountCheckStep(QATransformStep): def __init__(self, id, source_host, source_sql=None, source_table_name=None, destination_table_name=None, destination_table_definition=None, destination_sql=None, tolerance=1.0, script_arguments=None, - log_to_s3=False, script=None, **kwargs): + log_to_s3=False, script=None, source_count_sql=None, **kwargs): """Constructor for the CountCheckStep class Args: @@ -37,9 +37,9 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, raise ETLInputError( 'One of dest table name/schema or dest sql needed') - if not exactly_one(source_sql, source_table_name): + if not exactly_one(source_sql, source_table_name, source_count_sql): raise ETLInputError( - 'One of source table name or source sql needed') + 'One of source table name or source sql or source count needed') if script_arguments is None: script_arguments = list() @@ -55,7 +55,7 @@ def __init__(self, id, source_host, source_sql=None, source_table_name=None, destination_table_name, destination_sql) src_sql = self.convert_source_to_count_sql( - source_table_name, source_sql) + source_table_name, source_sql, source_count_sql) script_arguments.extend([ '--tolerance=%s' % str(tolerance), @@ -89,11 +89,14 @@ def convert_destination_to_count_sql(destination_table_name=None, @staticmethod def convert_source_to_count_sql(source_table_name=None, - source_sql=None): + source_sql=None, + source_count_sql=None): """Convert the source query into generic structure to compare """ if source_table_name is not None: source_sql = "SELECT COUNT(1) FROM %s" % source_table_name + elif source_count_sql is not None: + source_sql = source_count_sql else: origin_sql = SqlStatement(source_sql) source_sql = "SELECT COUNT(1) FROM (%s)a" % origin_sql.sql() diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 832516b..5537fce 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -99,6 +99,7 @@ def __init__(self, else: self._output = base_output_node + logger.debug('Script Arguments:') logger.debug(script_arguments) self.create_pipeline_object( From e3d0dd271b20a787c85e10c8262f6dd4b2c14789 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 27 Feb 2015 15:34:50 -0800 Subject: [PATCH 126/175] Column nested delimited string supression fix --- dataduct/database/parsers/select_query.py | 16 ++++- .../parsers/tests/test_select_query.py | 61 +++++++++++++++++++ dataduct/database/parsers/utils.py | 13 ++-- 3 files changed, 83 insertions(+), 7 deletions(-) create mode 100644 dataduct/database/parsers/tests/test_select_query.py diff --git a/dataduct/database/parsers/select_query.py b/dataduct/database/parsers/select_query.py index ee87b84..7f6d5ac 100644 --- a/dataduct/database/parsers/select_query.py +++ b/dataduct/database/parsers/select_query.py @@ -6,6 +6,7 @@ from pyparsing import restOfLine from pyparsing import Word from pyparsing import WordStart +from pyparsing import ParseException from .utils import _db_name from .utils import _from @@ -14,6 +15,14 @@ from .utils import def_field +def deduplicate_with_order(seq): + """Deduplicate a sequence while preserving the order + """ + seen = set() + seen_add = seen.add + return [x for x in seq if not (x in seen or 
seen_add(x))] + + def parse_select_base(string): """Parse a select query and return the dependencies @@ -54,7 +63,11 @@ def parse_select_dependencies(string): flattened_output = [item for sublist in output for item in sublist] # Deduplicated the list - return list(set(flattened_output)) + unique_output = deduplicate_with_order(flattened_output) + + if len(unique_output) == 0: + raise ParseException('No dependent table in select query') + return unique_output def parse_select_columns(string): @@ -97,5 +110,6 @@ def parse_column_name(string): words = Word(printables.replace('\n\r', '')).searchString(string) # Get the last word matched + # TODO: Make it more complicated name = words.pop().asList().pop() return name diff --git a/dataduct/database/parsers/tests/test_select_query.py b/dataduct/database/parsers/tests/test_select_query.py new file mode 100644 index 0000000..18e9ce9 --- /dev/null +++ b/dataduct/database/parsers/tests/test_select_query.py @@ -0,0 +1,61 @@ +"""Tests for select statement parser +""" + +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import raises +from pyparsing import ParseException + +from ..select_query import parse_select_dependencies +from ..select_query import parse_select_columns +from ..select_query import parse_column_name + + +class TestCreateTableStatement(TestCase): + """Tests for create table + """ + @staticmethod + def test_basic(): + """Basic test for select statement + """ + query = ('SELECT x, y, z AS t FROM abc JOIN pqr USING(y) WHERE x=1') + + dependencies = parse_select_dependencies(query) + eq_(dependencies, ['abc', 'pqr']) + + columns = parse_select_columns(query) + eq_(columns, ['x', 'y', 'z AS t']) + + column_name = parse_column_name(columns[0]) + eq_(column_name, 'x') + + column_name = parse_column_name(columns[2]) + eq_(column_name, 't') + + @staticmethod + @raises(ParseException) + def test_bad_input(): + """Feeding malformed input into create table + """ + query = 'SELECT x, y, z' + parse_select_dependencies(query) + + @staticmethod + def test_columns(): + """Basic test for select statement + """ + query = ('SELECT x' + ',CASE WHEN y=10 THEN 5 ELSE z' + ',CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END' + ',COUNT(1) AS c ' + 'FROM abc') + + result = [ + 'x', + 'CASE WHEN y=10 THEN 5 ELSE z', + 'CASE WHEN x THEN COUNT(MIN(x,y)) ELSE MIN(x) END', + 'COUNT(1) AS c', + ] + + columns = parse_select_columns(query) + eq_(columns, result) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index fdc0d57..e48f433 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -3,11 +3,10 @@ from pyparsing import alphanums from pyparsing import CaselessKeyword -from pyparsing import CharsNotIn -from pyparsing import OneOrMore -from pyparsing import ZeroOrMore from pyparsing import Combine +from pyparsing import Forward from pyparsing import nums +from pyparsing import OneOrMore from pyparsing import Word @@ -59,6 +58,8 @@ column_types = _smallint | _integer | _bigint | _decimal | _real | _double column_types |= _boolean | _char | _varchar | _date | _timestamp -subquery = Combine('(' + ZeroOrMore(CharsNotIn(')')) + ')') -_word = Word(alphanums+"_-. *`") -def_field = Combine(OneOrMore(_word | subquery)) +subquery = Forward() +_word = Word(alphanums+"_-. 
*`> Date: Sat, 28 Feb 2015 11:24:31 -0800 Subject: [PATCH 127/175] split statement fix --- .../database/parsers/tests/test_transfrom.py | 8 +++++ dataduct/database/parsers/transform.py | 32 +++++++++--------- dataduct/database/sql/tests/test_sql_utils.py | 33 +++++++++++++++++++ dataduct/steps/column_check.py | 5 ++- dataduct/steps/scripts/column_check_test.py | 9 ++--- 5 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 dataduct/database/sql/tests/test_sql_utils.py diff --git a/dataduct/database/parsers/tests/test_transfrom.py b/dataduct/database/parsers/tests/test_transfrom.py index 531b3da..28a0094 100644 --- a/dataduct/database/parsers/tests/test_transfrom.py +++ b/dataduct/database/parsers/tests/test_transfrom.py @@ -210,3 +210,11 @@ def test_multiple_sql(): eq_(split_statements(remove_empty_statements( remove_comments(data))), result) + + @staticmethod + def test_split_escaped_sql(): + """Split SQL statement with strings that have semicolon + """ + data = "a; xyz='0;0'; c;" + result = ['a', "xyz='0;0'", 'c'] + eq_(split_statements(data), result) diff --git a/dataduct/database/parsers/transform.py b/dataduct/database/parsers/transform.py index ee73953..c675bb4 100644 --- a/dataduct/database/parsers/transform.py +++ b/dataduct/database/parsers/transform.py @@ -5,14 +5,10 @@ from pyparsing import CaselessKeyword from pyparsing import CharsNotIn -from pyparsing import delimitedList from pyparsing import Literal from pyparsing import nestedExpr from pyparsing import OneOrMore -from pyparsing import originalTextFor -from pyparsing import printables from pyparsing import replaceWith -from pyparsing import Word from pyparsing import WordStart from pyparsing import ZeroOrMore @@ -76,7 +72,7 @@ def remove_transactional(string): return transaction.suppress().transformString(string) -def split_statements(string, seperator=';'): +def split_statements(string, seperator=';', quote_char="'"): """Seperate the string based on the seperator Args: @@ -89,16 +85,22 @@ def split_statements(string, seperator=';'): if string == '': return [] - # words can contain anything but the seperator - printables_less_seperator = printables.replace(seperator, '') - - # capture content between seperators, and preserve original text - content = originalTextFor(OneOrMore(Word(printables_less_seperator))) - - # process the string - tokens = delimitedList(content, seperator).parseString(string) - - return tokens.asList() + # We can not directly split a sql statement as we want to skip on + # semicolons inside a string in the sql query. 
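To make the comment above concrete: the loop that follows splits only when an
even number of quote characters has been seen so far, i.e. when the scanner is
outside a string literal; a doubled quote ('') bumps the count twice and
cancels out. A minimal usage sketch of the intended behaviour (the first call
mirrors the new test added below; the second input is an illustrative extra,
not from the test suite):

    split_statements("a; xyz='0;0'; c;")   # -> ['a', "xyz='0;0'", 'c']
    split_statements("it''s a; b")         # -> ["it''s a", 'b']
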
+ stack = 0 + result = [] + statement = '' + for char in string: + if char == seperator and not stack % 2: + result.append(statement.strip()) + statement = '' + else: + statement += char + if char == quote_char: + stack += 1 + if statement.strip(): + result.append(statement.strip()) + return result def remove_newlines(string): diff --git a/dataduct/database/sql/tests/test_sql_utils.py b/dataduct/database/sql/tests/test_sql_utils.py new file mode 100644 index 0000000..e3129ca --- /dev/null +++ b/dataduct/database/sql/tests/test_sql_utils.py @@ -0,0 +1,33 @@ +"""Tests the utils functions +""" +from unittest import TestCase +from nose.tools import eq_ +from nose.tools import assert_not_equal + +from ..utils import balanced_parenthesis +from ..utils import sanatize_sql + + +class TestSqlUtils(TestCase): + """Tests for sql utils function + """ + @staticmethod + def test_balanced_paranthesis(): + """Test for balanced_parenthesis + """ + eq_(balanced_parenthesis('SELECT 1;'), True) + eq_(balanced_parenthesis('SELECT 1(;'), False) + eq_(balanced_parenthesis('SELECT 1();'), True) + eq_(balanced_parenthesis('SELECT 1(abcd);'), True) + eq_(balanced_parenthesis('SELECT 1(ab[cd);'), True) + eq_(balanced_parenthesis('SELECT 1(ab[cd));'), False) + eq_(balanced_parenthesis('SELECT 1);'), False) + eq_(balanced_parenthesis('SELECT 1(ab)(ab);'), True) + eq_(balanced_parenthesis('SELECT 1(a(ab)b);'), True) + + @staticmethod + def test_sanatize_sql(): + """Test for sanatize_sql + """ + sql = "SELECT 1 if x='x;y'; SELECT 1 ;" + eq_(sanatize_sql(sql), ["SELECT 1 if x='x;y'", 'SELECT 1']) diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index 82dc8ea..33b4072 100644 --- a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -112,7 +112,10 @@ def convert_source_to_column_sql(source_sql, primary_key_index, """Convert the source query into generic structure to compare """ origin_sql = SelectStatement(SqlScript(source_sql).statements[0].sql()) - column_names = [x.name for x in origin_sql.columns()] + + # Remove column name references to tables as t.session_id should be + # session_id as we wrap the whole query. 
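For example, a source query such as SELECT t.session_id, t.created_at FROM
sessions t (table and column names here are illustrative) yields dot-qualified
column names; keeping only the part after the last dot gives the bare names
that remain valid once the query is wrapped as a subselect, while unqualified
names pass through unchanged:

    [name.split('.')[-1] for name in ['t.session_id', 't.created_at', 'user_id']]
    # -> ['session_id', 'created_at', 'user_id']
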
+ column_names = [x.name.split('.')[-1] for x in origin_sql.columns()] non_primary_key_index = [idx for idx in range(len(column_names)) if idx not in primary_key_index] diff --git a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py index 58d0d9d..9f3187b 100644 --- a/dataduct/steps/scripts/column_check_test.py +++ b/dataduct/steps/scripts/column_check_test.py @@ -13,8 +13,8 @@ from dataduct.data_access import rds_connection from dataduct.qa import ColumnCheck -pandas.options.display.max_colwidth = 100000 -pandas.options.display.max_rows = 10000 +pandas.options.display.max_colwidth = 1000 +pandas.options.display.max_rows = 1000 def _get_source_data(sql, hostname, sample_size): @@ -114,10 +114,11 @@ def main(): # Open up a connection and read the source and destination tables source_data = _get_source_data(args.source_sql, args.source_host, args.sample_size) - print source_data + print source_data.to_string() + destination_data = _get_destination_data(args.destination_sql, list(source_data.index)) - print destination_data + print destination_data.to_string() check = ColumnCheck(source_data, destination_data, name=args.test_name, From b79f3656c0165fe82df345a39ad66cc254238ed9 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 28 Feb 2015 14:57:52 -0800 Subject: [PATCH 128/175] frequency override --- bin/dataduct | 32 +++++++++++++++++++++++--------- dataduct/steps/column_check.py | 2 +- 2 files changed, 24 insertions(+), 10 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index ac18e85..b6ed2c2 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -8,6 +8,9 @@ import argparse from dataduct.config import Config from dataduct.config import logger_configuration +import logging +logger = logging.getLogger(__name__) + CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' @@ -36,7 +39,8 @@ def config_actions(action, filename): return sync_from_s3(filename) -def initialize_etl_objects(load_definitions, delay=None): +def initialize_etl_objects(load_definitions, delay=None, + frequency_override=None): """Generate etl objects from yaml files """ from dataduct.etl import create_pipeline @@ -47,17 +51,21 @@ def initialize_etl_objects(load_definitions, delay=None): definition = read_pipeline_definition(load_definition) if delay is not None: definition.update({'delay': delay}) + if frequency_override is not None: + definition.update({'frequency': frequency_override}) etls.append(create_pipeline(definition)) return etls -def pipeline_actions(action, load_definitions, force_overwrite, delay): +def pipeline_actions(action, load_definitions, force_overwrite, delay, + frequency_override=None): """Pipeline related actions are executed in this block """ from dataduct.etl import activate_pipeline from dataduct.etl import validate_pipeline - for etl in initialize_etl_objects(load_definitions, delay): + for etl in initialize_etl_objects(load_definitions, delay, + frequency_override): if action in [VALIDATE_STR, ACTIVATE_STR]: validate_pipeline(etl, force_overwrite) if action == ACTIVATE_STR: @@ -253,20 +261,26 @@ def main(): if mode == DEV: mode = None - # To instantiate the singleton object with the correct state - # As this is the single entry point to the library - # We can use the __new__ function to set the debug_level - config = Config(mode=mode) - print '[WARNING] Running the pipeline in %s mode.' 
% config.mode + # To instantiate the singleton object with the correct state + # As this is the single entry point to the library + # We can use the __new__ function to set the debug_level + config = Config(mode=mode) # Setup up logging for package logger_configuration() + if mode is not None: + logger.warning('Running the pipeline in %s mode.' % config.mode) + + # Frequency override + # Certain modes in the config can override frequency of a pipeline + frequency_override = config.etl.get('FREQUENCY_OVERRIDE', None) + if args.command == CONFIG_COMMAND: config_actions(args.action, args.filename) elif args.command == PIPELINE_COMMAND: pipeline_actions(args.action, args.load_definitions, - args.force_overwrite, args.delay) + args.force_overwrite, args.delay, frequency_override) elif args.command == DATABASE_COMMAND: database_actions(args.action, args.table_definitions) else: diff --git a/dataduct/steps/column_check.py b/dataduct/steps/column_check.py index 33b4072..3b0ddaa 100644 --- a/dataduct/steps/column_check.py +++ b/dataduct/steps/column_check.py @@ -131,7 +131,7 @@ def convert_source_to_column_sql(source_sql, primary_key_index, for idx in non_primary_key_index]) concatenated_column = ('CONCAT(%s)' % column_string) - template = '''SELECT {primary_keys}, {concat_column} + template = '''SELECT {primary_keys}, {concat_column} AS merged_string FROM ({origin_sql}) AS origin {sql_tail}''' query = template.format(primary_keys=primary_key_str, From cdd1103b89ccb992e940c8a3abcab4c14624df17 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 28 Feb 2015 16:09:01 -0800 Subject: [PATCH 129/175] time input and transaction wrapper --- bin/dataduct | 2 ++ dataduct/etl/etl_pipeline.py | 4 +++- dataduct/steps/sql_command.py | 12 ++++++++++-- examples/example_bootstrap.yaml | 6 +++--- examples/example_column_check.yaml | 6 +++--- examples/example_count_check.yaml | 8 ++++---- examples/example_create_and_load_redshift.yaml | 6 +++--- examples/example_custom_extract_local.yaml | 6 +++--- examples/example_double_input.yaml | 6 +++--- examples/example_double_output.yaml | 6 +++--- examples/example_emr_streaming.yaml | 6 +++--- examples/example_extract_local.yaml | 6 +++--- examples/example_extract_rds.yaml | 6 +++--- examples/example_extract_redshift.yaml | 7 +++---- examples/example_extract_s3.yaml | 7 +++---- examples/example_load_redshift.yaml | 6 +++--- examples/example_pipeline_dependency.yaml | 4 ++-- examples/example_primary_key_check.yaml | 6 +++--- examples/example_reload.yaml | 6 +++--- examples/example_sql_command.yaml | 6 +++--- examples/example_transform.yaml | 6 +++--- examples/example_upsert.yaml | 6 +++--- 22 files changed, 72 insertions(+), 62 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index b6ed2c2..b326acb 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -276,6 +276,8 @@ def main(): # Certain modes in the config can override frequency of a pipeline frequency_override = config.etl.get('FREQUENCY_OVERRIDE', None) + + # Action parse if args.command == CONFIG_COMMAND: config_actions(args.action, args.filename) elif args.command == PIPELINE_COMMAND: diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 32b62fc..87e1735 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -69,8 +69,10 @@ def __init__(self, name, frequency='one-time', bootstrap(list of steps): bootstrap step definitions for resources """ - if load_time: + if load_time and isinstance(load_time, str): load_hour, load_min = [int(x) for x in load_time.split(':')] + 
elif load_time and isinstance(load_time, int): + load_hour, load_min = (load_time / 60, load_time % 60) else: load_hour, load_min = [None, None] diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index ab89686..05bf764 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -3,8 +3,10 @@ """ from .etl_step import ETLStep from ..pipeline import SqlActivity +from ..database import SqlScript from ..s3 import S3File from ..utils.helpers import exactly_one +from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError @@ -18,6 +20,7 @@ def __init__(self, script_arguments=None, queue=None, command=None, + wrap_transaction=True, **kwargs): """Constructor for the SqlCommandStep class @@ -36,9 +39,14 @@ def __init__(self, # Create S3File with script / command provided if script: - script = self.create_script(S3File(path=script)) + sql_script = SqlScript(filename=parse_path(script)) else: - script = self.create_script(S3File(text=command)) + sql_script = SqlScript(command) + + if wrap_transaction: + sql_script = sql_script.wrap_transaction() + + script = self.create_script(S3File(text=sql_script.sql())) self.create_pipeline_object( object_class=SqlActivity, diff --git a/examples/example_bootstrap.yaml b/examples/example_bootstrap.yaml index 3890e8c..b55fa72 100644 --- a/examples/example_bootstrap.yaml +++ b/examples/example_bootstrap.yaml @@ -1,8 +1,8 @@ -name : example_bootstrap -frequency : one-time +name: example_bootstrap +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the transform step +description: Example for the transform step bootstrap: ec2: diff --git a/examples/example_column_check.yaml b/examples/example_column_check.yaml index abd8921..201e7b7 100644 --- a/examples/example_column_check.yaml +++ b/examples/example_column_check.yaml @@ -1,8 +1,8 @@ -name : example_column_check -frequency : one-time +name: example_column_check +frequency: one-time load_time: 01:00 -description : Example for the column-check step +description: Example for the column-check step steps: - step_type: column-check diff --git a/examples/example_count_check.yaml b/examples/example_count_check.yaml index 65a02e2..f2504cd 100644 --- a/examples/example_count_check.yaml +++ b/examples/example_count_check.yaml @@ -1,8 +1,8 @@ -name : example_count_check -frequency : one-time -load_time : 01:00 +name: example_count_check +frequency: one-time +load_time: 01:00 -description : Example for the count-check step +description: Example for the count-check step steps: - step_type: count-check diff --git a/examples/example_create_and_load_redshift.yaml b/examples/example_create_and_load_redshift.yaml index 1be0c4d..57c2f9d 100644 --- a/examples/example_create_and_load_redshift.yaml +++ b/examples/example_create_and_load_redshift.yaml @@ -1,8 +1,8 @@ -name : example_create_and_load_redshift -frequency : one-time +name: example_create_and_load_redshift +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the load_redshift step +description: Example for the load_redshift step steps: - step_type: extract-local diff --git a/examples/example_custom_extract_local.yaml b/examples/example_custom_extract_local.yaml index 809df06..fa14c4c 100644 --- a/examples/example_custom_extract_local.yaml +++ b/examples/example_custom_extract_local.yaml @@ -1,8 +1,8 @@ -name : example_custom_extract_local -frequency : one-time +name: example_custom_extract_local +frequency: one-time load_time: 01:00 # Hour:Min in UTC 
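A note on the load_time values used throughout these examples: written as
01:00, a YAML 1.1 parser such as PyYAML loads the value as the string '01:00',
which the constructor splits on ':'; written without the leading zero (1:00),
the same parsers resolve it as the base-60 integer 60, which is presumably
what the new isinstance(load_time, int) branch above accommodates (minutes
past midnight). A small sketch, assuming PyYAML:

    import yaml
    yaml.safe_load('load_time: 01:00')  # -> {'load_time': '01:00'}
    yaml.safe_load('load_time: 1:00')   # -> {'load_time': 60}  (base-60 integer)
    # 60 -> (60 / 60, 60 % 60) == (1, 0), i.e. 01:00 UTC
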
-description : | +description: | This example uploads a local file to S3 with the extract-local step. steps: diff --git a/examples/example_double_input.yaml b/examples/example_double_input.yaml index 3cd3353..da73c19 100644 --- a/examples/example_double_input.yaml +++ b/examples/example_double_input.yaml @@ -1,8 +1,8 @@ -name : example_double_input -frequency : one-time +name: example_double_input +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the transform step with multiple inputs +description: Example for the transform step with multiple inputs steps: - step_type: extract-local diff --git a/examples/example_double_output.yaml b/examples/example_double_output.yaml index 9c06d07..0104ea7 100644 --- a/examples/example_double_output.yaml +++ b/examples/example_double_output.yaml @@ -1,8 +1,8 @@ -name : example_double_output -frequency : one-time +name: example_double_output +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the transform step with multiple outputs +description: Example for the transform step with multiple outputs steps: - step_type: extract-local diff --git a/examples/example_emr_streaming.yaml b/examples/example_emr_streaming.yaml index 1acc0ec..05fa2a6 100644 --- a/examples/example_emr_streaming.yaml +++ b/examples/example_emr_streaming.yaml @@ -1,12 +1,12 @@ -name : example_emr_streaming -frequency : one-time +name: example_emr_streaming +frequency: one-time load_time: 01:00 # Hour:Min in UTC emr_cluster_config: num_instances: 1 instance_size: m1.large ami_version: 3.3.1 -description : Example for the emr_streaming step +description: Example for the emr_streaming step steps: - step_type: extract-local diff --git a/examples/example_extract_local.yaml b/examples/example_extract_local.yaml index 377be7e..5ab1a5d 100644 --- a/examples/example_extract_local.yaml +++ b/examples/example_extract_local.yaml @@ -1,8 +1,8 @@ -name : example_extract_local -frequency : one-time +name: example_extract_local +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : | +description: | This example uploads a local file to S3 with the extract-local step. steps: diff --git a/examples/example_extract_rds.yaml b/examples/example_extract_rds.yaml index a6b002f..c8a382b 100644 --- a/examples/example_extract_rds.yaml +++ b/examples/example_extract_rds.yaml @@ -1,8 +1,8 @@ -name : example_extract_rds -frequency : one-time +name: example_extract_rds +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : | +description: | This example extracts data from mysql to S3 with the extract-rds step. 
steps: diff --git a/examples/example_extract_redshift.yaml b/examples/example_extract_redshift.yaml index 699673d..6e18f62 100644 --- a/examples/example_extract_redshift.yaml +++ b/examples/example_extract_redshift.yaml @@ -1,9 +1,8 @@ -name : example_extract_redshift -frequency : one-time +name: example_extract_redshift +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : | - This example extracts data out of redshift +description: This example extracts data out of redshift steps: - step_type: extract-redshift diff --git a/examples/example_extract_s3.yaml b/examples/example_extract_s3.yaml index febaf6f..cfbbd2b 100644 --- a/examples/example_extract_s3.yaml +++ b/examples/example_extract_s3.yaml @@ -1,9 +1,8 @@ -name : example_extract_s3 -frequency : one-time +name: example_extract_s3 +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : | - This example creates an S3Node given a S3 Uri +description: This example creates an S3Node given a S3 Uri steps: - step_type: extract-s3 diff --git a/examples/example_load_redshift.yaml b/examples/example_load_redshift.yaml index 1082641..06bebf3 100644 --- a/examples/example_load_redshift.yaml +++ b/examples/example_load_redshift.yaml @@ -1,8 +1,8 @@ -name : example_load_redshift -frequency : one-time +name: example_load_redshift +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the load_redshift step +description: Example for the load_redshift step steps: - step_type: extract-local diff --git a/examples/example_pipeline_dependency.yaml b/examples/example_pipeline_dependency.yaml index 3e0d889..70a2b2e 100644 --- a/examples/example_pipeline_dependency.yaml +++ b/examples/example_pipeline_dependency.yaml @@ -1,5 +1,5 @@ -name : example_pipeline_dependency -frequency : one-time +name: example_pipeline_dependency +frequency: one-time load_time: 01:00 # Hour:Min in UTC steps: diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml index 4b63d84..3cf3822 100644 --- a/examples/example_primary_key_check.yaml +++ b/examples/example_primary_key_check.yaml @@ -1,8 +1,8 @@ -name : example_primary_key_check -frequency : one-time +name: example_primary_key_check +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the primary-key-check step +description: Example for the primary-key-check step steps: - step_type: primary-key-check diff --git a/examples/example_reload.yaml b/examples/example_reload.yaml index 287a7c9..073b16c 100644 --- a/examples/example_reload.yaml +++ b/examples/example_reload.yaml @@ -1,8 +1,8 @@ -name : example_reload -frequency : one-time +name: example_reload +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the reload step +description: Example for the reload step steps: - step_type: extract-local diff --git a/examples/example_sql_command.yaml b/examples/example_sql_command.yaml index de8c180..5c13c7c 100644 --- a/examples/example_sql_command.yaml +++ b/examples/example_sql_command.yaml @@ -1,8 +1,8 @@ -name : example_sql_command -frequency : one-time +name: example_sql_command +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the sql_command step +description: Example for the sql_command step steps: - step_type: sql-command diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml index 0fa2b32..f1b0e73 100644 --- a/examples/example_transform.yaml +++ b/examples/example_transform.yaml @@ -1,9 +1,9 @@ -name : 
example_transform -frequency : one-time +name: example_transform +frequency: one-time load_time: 01:00 # Hour:Min in UTC ec2_resource_instance_type: m1.small -description : Example for the transform step, uses an m1.small instance instead of the default +description: Example for the transform step, uses an m1.small instance instead of the default steps: - step_type: extract-local diff --git a/examples/example_upsert.yaml b/examples/example_upsert.yaml index b882a86..e0f54a2 100644 --- a/examples/example_upsert.yaml +++ b/examples/example_upsert.yaml @@ -1,8 +1,8 @@ -name : example_upsert -frequency : one-time +name: example_upsert +frequency: one-time load_time: 01:00 # Hour:Min in UTC -description : Example for the upsert step +description: Example for the upsert step steps: - step_type: extract-local From a8920beb2c85278f52191103de7e8a6af0ca7ccf Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 28 Feb 2015 16:38:49 -0800 Subject: [PATCH 130/175] Spell Check --- dataduct/database/sql/sql_script.py | 8 ++++---- dataduct/database/sql/sql_statement.py | 8 ++++---- dataduct/database/sql/tests/test_sql_utils.py | 9 ++++----- dataduct/database/sql/utils.py | 2 +- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/dataduct/database/sql/sql_script.py b/dataduct/database/sql/sql_script.py index 79b5742..7b973aa 100644 --- a/dataduct/database/sql/sql_script.py +++ b/dataduct/database/sql/sql_script.py @@ -5,7 +5,7 @@ from .sql_statement import SqlStatement from .transaction import BeginStatement from .transaction import CommitStatement -from .utils import sanatize_sql +from .utils import sanitize_sql from ...utils.helpers import atmost_one @@ -26,7 +26,7 @@ def __init__(self, sql=None, statements=None, filename=None): sql = f.read() self._raw_sql = sql - self._raw_statements = self._sanatize_sql() + self._raw_statements = self._sanitize_sql() self._statements = self._initialize_statements() # Add the statements that the script was initialized from @@ -59,10 +59,10 @@ def sql(self): """ return ';\n'.join([x.sql() for x in self._statements]) + ';' - def _sanatize_sql(self): + def _sanitize_sql(self): """Clean the SQL, remove comments and empty statements """ - return sanatize_sql(self._raw_sql) + return sanitize_sql(self._raw_sql) def _initialize_statements(self): """Initialize SQL Statements based on the inputscipt diff --git a/dataduct/database/sql/sql_statement.py b/dataduct/database/sql/sql_statement.py index 208ed02..278377f 100644 --- a/dataduct/database/sql/sql_statement.py +++ b/dataduct/database/sql/sql_statement.py @@ -1,7 +1,7 @@ """Script that contains the sql statement class """ from copy import deepcopy -from .utils import sanatize_sql +from .utils import sanitize_sql from ..parsers import parse_create_table from ..parsers import parse_create_view @@ -16,7 +16,7 @@ def __init__(self, sql=None, transactional=False): sql = '' self._raw_sql = sql self.transactional = transactional - self._raw_statement = self._sanatize_sql() + self._raw_statement = self._sanitize_sql() def __str__(self): """Print a SqlStatement object @@ -33,13 +33,13 @@ def sql(self): """ return self._raw_statement - def _sanatize_sql(self): + def _sanitize_sql(self): """Clean the SQL, remove comments and empty statements """ if self._raw_sql is None: return '' - raw_statements = sanatize_sql(self._raw_sql, self.transactional) + raw_statements = sanitize_sql(self._raw_sql, self.transactional) if len(raw_statements) > 1: raise ValueError('SQL Statement can not contain more than 1 query') diff --git 
a/dataduct/database/sql/tests/test_sql_utils.py b/dataduct/database/sql/tests/test_sql_utils.py index e3129ca..7222feb 100644 --- a/dataduct/database/sql/tests/test_sql_utils.py +++ b/dataduct/database/sql/tests/test_sql_utils.py @@ -2,10 +2,9 @@ """ from unittest import TestCase from nose.tools import eq_ -from nose.tools import assert_not_equal from ..utils import balanced_parenthesis -from ..utils import sanatize_sql +from ..utils import sanitize_sql class TestSqlUtils(TestCase): @@ -26,8 +25,8 @@ def test_balanced_paranthesis(): eq_(balanced_parenthesis('SELECT 1(a(ab)b);'), True) @staticmethod - def test_sanatize_sql(): - """Test for sanatize_sql + def test_sanitize_sql(): + """Test for sanitize_sql """ sql = "SELECT 1 if x='x;y'; SELECT 1 ;" - eq_(sanatize_sql(sql), ["SELECT 1 if x='x;y'", 'SELECT 1']) + eq_(sanitize_sql(sql), ["SELECT 1 if x='x;y'", 'SELECT 1']) diff --git a/dataduct/database/sql/utils.py b/dataduct/database/sql/utils.py index c179bd5..55af3f1 100644 --- a/dataduct/database/sql/utils.py +++ b/dataduct/database/sql/utils.py @@ -22,7 +22,7 @@ def balanced_parenthesis(statement): return counter == 0 -def sanatize_sql(sql, keep_transaction=False): +def sanitize_sql(sql, keep_transaction=False): """Sanatize the sql string """ # remove comments From 76e99b160c2a4efdfbc5c519b91f30b27a93fcab Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 28 Feb 2015 21:45:12 -0800 Subject: [PATCH 131/175] Add more characters to parser --- dataduct/database/parsers/utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index e48f433..95cb6d9 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -58,8 +58,10 @@ column_types = _smallint | _integer | _bigint | _decimal | _real | _double column_types |= _boolean | _char | _varchar | _date | _timestamp -subquery = Forward() -_word = Word(alphanums+"_-. *`> Date: Sat, 28 Feb 2015 23:31:54 -0800 Subject: [PATCH 132/175] Ec2 inputs should all be modifiable --- dataduct/etl/etl_pipeline.py | 18 +++++++----------- dataduct/etl/tests/test_etl_pipeline.py | 6 ++---- examples/example_transform.yaml | 3 ++- 3 files changed, 11 insertions(+), 16 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 87e1735..cf86f2d 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -39,7 +39,6 @@ SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) NAME_PREFIX = config.etl.get('NAME_PREFIX', const.EMPTY_STR) DP_INSTANCE_LOG_PATH = config.etl.get('DP_INSTANCE_LOG_PATH', const.NONE) -INSTANCE_TYPE = config.ec2.get('INSTANCE_TYPE', const.M1_LARGE) class ETLPipeline(object): @@ -49,9 +48,7 @@ class ETLPipeline(object): and has functionality to add steps to the pipeline """ - def __init__(self, name, frequency='one-time', - ec2_resource_terminate_after='6 Hours', - ec2_resource_instance_type=INSTANCE_TYPE, + def __init__(self, name, frequency='one-time', ec2_resource_config=None, delay=0, emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=MAX_RETRIES, bootstrap=None, description=None): @@ -60,8 +57,6 @@ def __init__(self, name, frequency='one-time', Args: name (str): Name of the pipeline should be globally unique. frequency (enum): Frequency of the pipeline. 
Can be - ec2_resource_terminate_after (str): Timeout for ec2 resource - ec2_resource_instance_type (str): Instance type for ec2 resource delay(int): Number of days to delay the pipeline by emr_cluster_config(dict): Dictionary for emr config topic_arn(str): sns alert to be used by the pipeline @@ -79,8 +74,6 @@ def __init__(self, name, frequency='one-time', # Input variables self._name = name if not NAME_PREFIX else NAME_PREFIX + '_' + name self.frequency = frequency - self.ec2_resource_terminate_after = ec2_resource_terminate_after - self.ec2_resource_instance_type = ec2_resource_instance_type self.load_hour = load_hour self.load_min = load_min self.delay = delay @@ -100,6 +93,11 @@ def __init__(self, name, frequency='one-time', else: self.emr_cluster_config = dict() + if ec2_resource_config: + self.ec2_resource_config = ec2_resource_config + else: + self.ec2_resource_config = dict() + # Pipeline versions self.version_ts = datetime.utcnow() self.version_name = "version_" + \ @@ -275,10 +273,8 @@ def ec2_resource(self): object_class=Ec2Resource, s3_log_dir=self.s3_log_dir, schedule=self.schedule, - terminate_after=self.ec2_resource_terminate_after, - instance_type=self.ec2_resource_instance_type, + **self.ec2_resource_config ) - self.create_bootstrap_steps(const.EC2_RESOURCE_STR) return self._ec2_resource diff --git a/dataduct/etl/tests/test_etl_pipeline.py b/dataduct/etl/tests/test_etl_pipeline.py index 17ad436..64fd270 100644 --- a/dataduct/etl/tests/test_etl_pipeline.py +++ b/dataduct/etl/tests/test_etl_pipeline.py @@ -24,8 +24,7 @@ def test_construct_etl_pipeline(): result = ETLPipeline( 'test_pipeline', frequency='one-time', - ec2_resource_terminate_after='2 Hours', - ec2_resource_instance_type='m1.small', + ec2_resource_config={'terminate_after':'2 Hours'}, delay=13, emr_cluster_config={'cfg1': 'value'}, load_time='12:34', @@ -35,8 +34,7 @@ def test_construct_etl_pipeline(): ) assert result.name.endswith('test_pipeline') eq_(result.frequency, 'one-time') - eq_(result.ec2_resource_terminate_after, '2 Hours') - eq_(result.ec2_resource_instance_type, 'm1.small') + eq_(result.ec2_resource_config, {'terminate_after':'2 Hours'}) eq_(result.load_hour, 12) eq_(result.load_min, 34) eq_(result.delay, 13) diff --git a/examples/example_transform.yaml b/examples/example_transform.yaml index f1b0e73..bb7af42 100644 --- a/examples/example_transform.yaml +++ b/examples/example_transform.yaml @@ -1,7 +1,8 @@ name: example_transform frequency: one-time load_time: 01:00 # Hour:Min in UTC -ec2_resource_instance_type: m1.small +ec2_resource_config: + instance_type: m1.small description: Example for the transform step, uses an m1.small instance instead of the default From ca7357b4f6626c321534d071eea5ce8bae29c5ee Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 1 Mar 2015 08:02:01 -0800 Subject: [PATCH 133/175] Comments on parser --- dataduct/database/parsers/create_table.py | 4 ++-- dataduct/database/parsers/select_query.py | 4 ++-- dataduct/database/parsers/utils.py | 12 ++++++++++-- examples/example_transform.yaml | 4 +++- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/dataduct/database/parsers/create_table.py b/dataduct/database/parsers/create_table.py index e21869d..f2372c6 100644 --- a/dataduct/database/parsers/create_table.py +++ b/dataduct/database/parsers/create_table.py @@ -19,7 +19,7 @@ from .utils import _sortkey from .utils import _table from .utils import column_types -from .utils import def_field +from .utils import field_parser from .utils import pk_check from .helpers 
import existance_check @@ -65,7 +65,7 @@ def get_base_parser(): table_definition(pyparsing): Parser for create table statements """ table_def = get_definition_start() + \ - paranthesis_list('raw_fields', def_field) + \ + paranthesis_list('raw_fields', field_parser) + \ get_attributes_parser() return table_def diff --git a/dataduct/database/parsers/select_query.py b/dataduct/database/parsers/select_query.py index 7f6d5ac..238d5e6 100644 --- a/dataduct/database/parsers/select_query.py +++ b/dataduct/database/parsers/select_query.py @@ -12,7 +12,7 @@ from .utils import _from from .utils import _join from .utils import _select -from .utils import def_field +from .utils import field_parser def deduplicate_with_order(seq): @@ -87,7 +87,7 @@ def parse_select_columns(string): suppressor = MatchFirst(_from) + restOfLine string = suppressor.suppress().transformString(string) - parser = _select + delimitedList(def_field).setResultsName('columns') + parser = _select + delimitedList(field_parser).setResultsName('columns') output = parser.parseString(string).columns.asList() # Strip extra whitespace from the string diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index 95cb6d9..b260849 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -55,13 +55,21 @@ _db_name = Word(alphanums+"_-.") pk_check = (_primary_key | _unique) +# Column types column_types = _smallint | _integer | _bigint | _decimal | _real | _double column_types |= _boolean | _char | _varchar | _date | _timestamp -def_field = Forward() +# Define a field parser for create table fields or select query fields +field_parser = Forward() subquery = Forward() + +# List of characters allowed in the query statements special_character = "_-. 
*`> Date: Sun, 1 Mar 2015 12:37:38 -0800 Subject: [PATCH 134/175] autocomplete --- bin/dataduct | 6 ++++++ setup.py | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/bin/dataduct b/bin/dataduct index b326acb..e58bc04 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -1,4 +1,5 @@ #!/usr/bin/env python +# PYTHON_ARGCOMPLETE_OK """Script that helps create and validate pipelines from command line """ @@ -253,6 +254,11 @@ def main(): help='Enter the paths of the table definitions', ) + try: + import argcomplete + argcomplete.autocomplete(parser) + except ImportError: + pass args = parser.parse_args() mode = args.mode diff --git a/setup.py b/setup.py index 138d2c1..993f35b 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,8 @@ 'MySQL-python', 'pyparsing', 'testfixtures', - 'sphinx_rtd_theme' + 'sphinx_rtd_theme', + 'argcomplete' ], scripts=['bin/dataduct'], classifiers=[ From 97c952dd0b97cb2b27799bb65ea97384a250ce8f Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 1 Mar 2015 15:44:02 -0800 Subject: [PATCH 135/175] cli refactor for more intuitive output --- bin/dataduct | 219 +++++++++++++++++----------------- dataduct/database/database.py | 4 +- 2 files changed, 113 insertions(+), 110 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index e58bc04..387b524 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -5,6 +5,7 @@ """ import argparse +from argparse import RawTextHelpFormatter from dataduct.config import Config from dataduct.config import logger_configuration @@ -15,6 +16,7 @@ logger = logging.getLogger(__name__) CREATE_STR = 'create' VALIDATE_STR = 'validate' ACTIVATE_STR = 'activate' +VISUALIZE_STR = 'visualize' DROP_STR = 'drop' GRANT_STR = 'grant' RECREATE_STR = 'recreate' @@ -25,10 +27,11 @@ CONFIG_FROM_S3 = 'sync_from_s3' CONFIG_COMMAND = 'config' DATABASE_COMMAND = 'database' PIPELINE_COMMAND = 'pipeline' -VISUALIZE_COMMAND = 'visualize' - DEV = 'dev' +formatter_class = lambda prog: RawTextHelpFormatter(prog, max_help_position=50) + + def config_actions(action, filename): """Config related actions are executed in this block """ @@ -58,12 +61,14 @@ def initialize_etl_objects(load_definitions, delay=None, return etls -def pipeline_actions(action, load_definitions, force_overwrite, delay, - frequency_override=None): +def pipeline_actions(action, load_definitions, force_overwrite=None, + delay=None, frequency_override=None, + activities_only=None, filename=None): """Pipeline related actions are executed in this block """ from dataduct.etl import activate_pipeline from dataduct.etl import validate_pipeline + from dataduct.etl import visualize_pipeline for etl in initialize_etl_objects(load_definitions, delay, frequency_override): @@ -71,9 +76,11 @@ def pipeline_actions(action, load_definitions, force_overwrite, delay, validate_pipeline(etl, force_overwrite) if action == ACTIVATE_STR: activate_pipeline(etl) + if action == VISUALIZE_STR: + visualize_pipeline(etl, activities_only, filename) -def database_actions(action, table_definitions): +def database_actions(action, table_definitions, filename=None): """Database related actions are executed in this block """ from dataduct.database import Database @@ -87,36 +94,18 @@ def database_actions(action, table_definitions): script = database.grant_relations_script() elif action == RECREATE_STR: script = database.recreate_relations_script() + elif action == VISUALIZE_STR: + database.visualize(filename) + script = '' print script -def visualize_pipeline_actions(load_definitions, activities_only, filename): - """Visualization 
actions for pipelines are executed in this block - """ - - from dataduct.etl import visualize_pipeline - - for etl in initialize_etl_objects(load_definitions): - visualize_pipeline(etl, activities_only, filename) - - -def visualize_database_actions(table_definitions, filename): - """Visualization actions for databases are executed in this block - """ - - from dataduct.database import Database - - database = Database(files=table_definitions) - database.visualize(filename) - - class _HelpAction(argparse._HelpAction): """HelpAction class used to render a custom help message """ def __call__(self, parser, namespace, values, option_string=None): parser.print_help() print '' - # retrieve subparsers from parser subparsers_actions = [ action for action in parser._actions @@ -127,57 +116,99 @@ class _HelpAction(argparse._HelpAction): for choice, subparser in subparsers_action.choices.items(): print "Command '{}'".format(choice) print subparser.format_usage() - parser.exit() +def choice_help_formatter(choices): + """Create help format for a choice argument + """ + result = [] + for key, value in choices.iteritems(): + result.append('%s: %s' % (key, value)) + return '\n'.join(result) + + def main(): - """Main function""" - parser = argparse.ArgumentParser(description='Run Dataduct commands', - add_help=False) - parser.add_argument( + """Main function + """ + + # Overwrite default help + help_parser = argparse.ArgumentParser( + description='Help message', + add_help=False) + help_parser.add_argument( + '-h', + '--help', + action=_HelpAction, + help='Help message', + ) + + # Base parser for all commands + base_parser = argparse.ArgumentParser( + description='Base Parser', + add_help=False, + ) + base_parser.add_argument( + '-F', + '--filename', + default=None, + help='Filename to store output of commands', + ) + base_parser.add_argument( '-m', '--mode', default=None, help='Mode to run the pipeline and config overrides to use', ) - # Overwrite default help - parser.add_argument( - '-h', - '--help', - action=_HelpAction, - help='Show this help message and exit', + + # Main parser + parser = argparse.ArgumentParser( + description='Run Dataduct commands', + add_help=False, + parents=[help_parser], + formatter_class=formatter_class, ) - subparsers = parser.add_subparsers(help='Commands', dest='command') + subparsers = parser.add_subparsers( + dest='command', + help='Actions for various features') # Config parser declaration - config_parser = subparsers.add_parser(CONFIG_COMMAND) + config_choices = { + CONFIG_TO_S3: 'sync config file from local to s3', + CONFIG_FROM_S3: 'sync config file from s3 to local file', + } + config_parser = subparsers.add_parser( + CONFIG_COMMAND, + parents=[base_parser], + formatter_class=formatter_class, + help='Command to sync config to and from S3' + ) config_parser.add_argument( 'action', type=str, - choices={ - CONFIG_TO_S3: 'sync config file from local to s3', - CONFIG_FROM_S3: 'sync config file from s3 to local file', - }, + choices=config_choices, + help=choice_help_formatter(config_choices), default=CONFIG_FROM_S3, ) - config_parser.add_argument( - '-f', - '--filename', - default=None, - help='Filename to sync', - ) # Pipeline parser declaration - pipeline_parser = subparsers.add_parser(PIPELINE_COMMAND) + pipeline_choices = { + CREATE_STR: 'Create a pipeline locally', + VALIDATE_STR: 'Validate a pipeline with AWS without activating', + ACTIVATE_STR: 'Activate the pipeline on AWS', + VISUALIZE_STR: 'Visualize the pipeline', + } + pipeline_parser = subparsers.add_parser( + 
PIPELINE_COMMAND, + parents=[base_parser], + formatter_class=formatter_class, + help='Command for various operations on pipeline definitions', + ) pipeline_parser.add_argument( 'action', type=str, - choices={ - CREATE_STR: 'Create a pipeline locally', - VALIDATE_STR: 'Validate a pipeline with AWS without activating', - ACTIVATE_STR: 'create a pipeline and activate it on AWS', - }, + choices=pipeline_choices, + help=choice_help_formatter(pipeline_choices), default=CREATE_STR, ) pipeline_parser.add_argument( @@ -199,18 +230,31 @@ def main(): type=int, help='Delay the pipeline by x days', ) + pipeline_parser.add_argument( + '--activities_only', + action='store_true', + help='Visualize only activities', + ) # Database parser declaration - database_parser = subparsers.add_parser(DATABASE_COMMAND) + database_choices = { + CREATE_STR: 'Create tables', + DROP_STR: 'Drop views and tables', + GRANT_STR: 'Grant permissions to neccessary groups', + RECREATE_STR: 'Recreate tables, load new data, drop old tables', + VISUALIZE_STR: 'Visualize the database er-diagram', + } + database_parser = subparsers.add_parser( + DATABASE_COMMAND, + parents=[base_parser], + formatter_class=formatter_class, + help='Command for various operations on the database', + ) database_parser.add_argument( 'action', type=str, - choices={ - CREATE_STR: 'Create tables', - DROP_STR: 'Drop views and tables', - GRANT_STR: 'Grant permissions to neccessary groups', - RECREATE_STR: 'Recreate tables, load new data, drop old tables', - }, + choices=database_choices, + help=choice_help_formatter(database_choices), ) database_parser.add_argument( 'table_definitions', @@ -218,42 +262,7 @@ def main(): help='Enter the paths of the table definitions', ) - # Visualize parser declaration - visualize_parser = subparsers.add_parser(VISUALIZE_COMMAND) - visualize_subparsers = visualize_parser.add_subparsers( - help='Commands', dest='visualize_command') - - # Visualize pipeline parser declaration - visualize_pipeline_parser = \ - visualize_subparsers.add_parser(PIPELINE_COMMAND) - visualize_pipeline_parser.add_argument( - 'filename', - help='Filename for the graph', - ) - visualize_pipeline_parser.add_argument( - 'load_definitions', - nargs='+', - help='Enter the paths of the load definitions', - ) - visualize_pipeline_parser.add_argument( - '--activities-only', - action='store_true', - help='Visualize only activities', - ) - - # Visualize database parser declaration - visualize_database_parser = \ - visualize_subparsers.add_parser(DATABASE_COMMAND) - visualize_database_parser.add_argument( - 'filename', - help='Filename for the graph', - ) - visualize_database_parser.add_argument( - 'table_definitions', - nargs='+', - help='Enter the paths of the table definitions', - ) - + # Check if autocomplete is possible try: import argcomplete argcomplete.autocomplete(parser) @@ -261,6 +270,7 @@ def main(): pass args = parser.parse_args() + # Check if mode is dev mode = args.mode if mode is not None: # We assume mode:dev = mode:None @@ -282,22 +292,17 @@ def main(): # Certain modes in the config can override frequency of a pipeline frequency_override = config.etl.get('FREQUENCY_OVERRIDE', None) - # Action parse if args.command == CONFIG_COMMAND: config_actions(args.action, args.filename) elif args.command == PIPELINE_COMMAND: pipeline_actions(args.action, args.load_definitions, - args.force_overwrite, args.delay, frequency_override) + args.force_overwrite, args.delay, frequency_override, + args.activities_only, args.filename) elif args.command == 
DATABASE_COMMAND: - database_actions(args.action, args.table_definitions) + database_actions(args.action, args.table_definitions, args.filename) else: - if args.visualize_command == PIPELINE_COMMAND: - visualize_pipeline_actions( - args.load_definitions, args.activities_only, args.filename) - else: - visualize_database_actions( - args.table_definitions, args.filename) + raise ValueError('Unknown argument provided, use dataduct') if __name__ == '__main__': diff --git a/dataduct/database/database.py b/dataduct/database/database.py index bd86d72..0589093 100644 --- a/dataduct/database/database.py +++ b/dataduct/database/database.py @@ -8,8 +8,6 @@ from .sql import SqlScript from ..utils.helpers import atmost_one -from ..utils.helpers import parse_path - from ..utils.exceptions import DatabaseInputError import logging @@ -46,7 +44,7 @@ def _initialize_relations(files): """ relations = [] for filename in files: - with open(parse_path(filename)) as f: + with open(filename) as f: script = SqlScript(f.read()) if script.creates_table(): relations.append(Table(script)) From a5b52e2146fc92565f3c8d5df9575c0e5a06b19c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 2 Mar 2015 02:46:32 -0800 Subject: [PATCH 136/175] CLI update for parameter regulation --- bin/dataduct | 407 +++++++++++++++++++----------- dataduct/config/config_actions.py | 2 +- dataduct/etl/etl_actions.py | 6 +- 3 files changed, 259 insertions(+), 156 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index 387b524..7f6dc4a 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -3,8 +3,8 @@ """Script that helps create and validate pipelines from command line """ - import argparse +from argparse import ArgumentParser from argparse import RawTextHelpFormatter from dataduct.config import Config @@ -13,37 +13,25 @@ from dataduct.config import logger_configuration import logging logger = logging.getLogger(__name__) -CREATE_STR = 'create' -VALIDATE_STR = 'validate' -ACTIVATE_STR = 'activate' -VISUALIZE_STR = 'visualize' -DROP_STR = 'drop' -GRANT_STR = 'grant' -RECREATE_STR = 'recreate' +PIPELINE = 'pipeline' +CREATE = 'create' +VALIDATE = 'validate' +ACTIVATE = 'activate' +VISUALIZE = 'visualize' +CONFIG = 'config' CONFIG_TO_S3 = 'sync_to_s3' CONFIG_FROM_S3 = 'sync_from_s3' -CONFIG_COMMAND = 'config' -DATABASE_COMMAND = 'database' -PIPELINE_COMMAND = 'pipeline' -DEV = 'dev' - -formatter_class = lambda prog: RawTextHelpFormatter(prog, max_help_position=50) - - -def config_actions(action, filename): - """Config related actions are executed in this block - """ - from dataduct.config.config_actions import sync_to_s3 - from dataduct.config.config_actions import sync_from_s3 +DATABASE = 'database' +DROP = 'drop' +GRANT = 'grant' +RECREATE = 'recreate' - if action == CONFIG_TO_S3: - return sync_to_s3() - return sync_from_s3(filename) +DEV = 'dev' -def initialize_etl_objects(load_definitions, delay=None, +def initialize_etl_objects(pipeline_definitions, delay=None, frequency_override=None): """Generate etl objects from yaml files """ @@ -51,8 +39,8 @@ def initialize_etl_objects(load_definitions, delay=None, from dataduct.etl import read_pipeline_definition etls = [] - for load_definition in load_definitions: - definition = read_pipeline_definition(load_definition) + for pipeline_definition in pipeline_definitions: + definition = read_pipeline_definition(pipeline_definition) if delay is not None: definition.update({'delay': delay}) if frequency_override is not None: @@ -61,43 +49,57 @@ def initialize_etl_objects(load_definitions, delay=None, return 
etls -def pipeline_actions(action, load_definitions, force_overwrite=None, - delay=None, frequency_override=None, - activities_only=None, filename=None): +def config_actions(action, filename=None, **kwargs): + """Config related actions are executed in this block + """ + from dataduct.config.config_actions import sync_to_s3 + from dataduct.config.config_actions import sync_from_s3 + + if action == CONFIG_TO_S3: + return sync_to_s3() + return sync_from_s3(filename) + + +def pipeline_actions(action, pipeline_definitions, force=None, delay=None, + frequency_override=None, activities_only=None, + filename=None, **kwargs): """Pipeline related actions are executed in this block """ from dataduct.etl import activate_pipeline from dataduct.etl import validate_pipeline from dataduct.etl import visualize_pipeline - for etl in initialize_etl_objects(load_definitions, delay, + for etl in initialize_etl_objects(pipeline_definitions, delay, frequency_override): - if action in [VALIDATE_STR, ACTIVATE_STR]: - validate_pipeline(etl, force_overwrite) - if action == ACTIVATE_STR: + if action in [VALIDATE, ACTIVATE]: + validate_pipeline(etl, force) + if action == ACTIVATE: activate_pipeline(etl) - if action == VISUALIZE_STR: + if action == VISUALIZE: visualize_pipeline(etl, activities_only, filename) -def database_actions(action, table_definitions, filename=None): +def database_actions(action, table_definitions, filename=None, **kwargs): """Database related actions are executed in this block """ from dataduct.database import Database + script = None database = Database(files=table_definitions) - if action == CREATE_STR: + if action == CREATE: script = database.create_relations_script() - elif action == DROP_STR: + elif action == DROP: script = database.drop_relations_script() - elif action == GRANT_STR: + elif action == GRANT: script = database.grant_relations_script() - elif action == RECREATE_STR: + elif action == RECREATE: script = database.recreate_relations_script() - elif action == VISUALIZE_STR: + elif action == VISUALIZE: database.visualize(filename) - script = '' - print script + + # TODO: Build execution options + if script: + print script class _HelpAction(argparse._HelpAction): @@ -119,23 +121,18 @@ class _HelpAction(argparse._HelpAction): parser.exit() -def choice_help_formatter(choices): - """Create help format for a choice argument - """ - result = [] - for key, value in choices.iteritems(): - result.append('%s: %s' % (key, value)) - return '\n'.join(result) - - def main(): """Main function """ + formatter_class = lambda prog: RawTextHelpFormatter( + prog, max_help_position=50) - # Overwrite default help - help_parser = argparse.ArgumentParser( - description='Help message', - add_help=False) + # Help parser for parsing subparsers in help + help_parser = ArgumentParser( + description='Run Dataduct commands', + add_help=False, + formatter_class=formatter_class, + ) help_parser.add_argument( '-h', '--help', @@ -143,26 +140,76 @@ def main(): help='Help message', ) - # Base parser for all commands - base_parser = argparse.ArgumentParser( - description='Base Parser', + # Mode parser shared across all pipeline subparsers + mode_help = 'Mode to run the pipeline and config overrides to use' + mode_parser = ArgumentParser( + description=mode_help, add_help=False, ) - base_parser.add_argument( - '-F', - '--filename', - default=None, - help='Filename to store output of commands', - ) - base_parser.add_argument( + mode_parser.add_argument( '-m', '--mode', default=None, - help='Mode to run the pipeline and 
config overrides to use', + help=mode_help + ) + + # Options parser shared actions all pipeline run options + pipeline_run_options = ArgumentParser( + description='Specify actions related to running the pipelines', + add_help=False + ) + pipeline_run_options.add_argument( + '-f', + '--force', + action='store_true', + default=False, + help='Indicates that if this pipeline exists, it will be destroyed', + ) + pipeline_run_options.add_argument( + '-d', + '--delay', + default=0, + type=int, + help='Delay the pipeline by x days', + ) + + # Pipeline definitions parser + pipeline_definition_help = 'Paths of the pipeline definitions' + pipeline_definition_parser = ArgumentParser( + description=pipeline_definition_help, + add_help=False, + ) + pipeline_definition_parser.add_argument( + 'pipeline_definitions', + nargs='+', + help=pipeline_definition_help, + ) + + # Table definitions parser + table_definition_help = 'Paths of the table definitions' + table_definition_parser = ArgumentParser( + description=table_definition_help, + add_help=False, + ) + table_definition_parser.add_argument( + 'table_definitions', + nargs='+', + help=table_definition_help, + ) + + # Filepath input parser + filepath_help = 'filepath input for storing output of actions' + file_parser = ArgumentParser( + description=filepath_help, + add_help=False, + ) + file_parser.add_argument( + dest='filename', + help='Filename to store output of commands', ) # Main parser - parser = argparse.ArgumentParser( + parser = ArgumentParser( description='Run Dataduct commands', add_help=False, parents=[help_parser], @@ -170,96 +217,152 @@ def main(): ) subparsers = parser.add_subparsers( dest='command', - help='Actions for various features') + help='Actions for various features', + ) - # Config parser declaration - config_choices = { - CONFIG_TO_S3: 'sync config file from local to s3', - CONFIG_FROM_S3: 'sync config file from s3 to local file', - } - config_parser = subparsers.add_parser( - CONFIG_COMMAND, - parents=[base_parser], - formatter_class=formatter_class, - help='Command to sync config to and from S3' - ) - config_parser.add_argument( - 'action', - type=str, - choices=config_choices, - help=choice_help_formatter(config_choices), - default=CONFIG_FROM_S3, - ) - - # Pipeline parser declaration - pipeline_choices = { - CREATE_STR: 'Create a pipeline locally', - VALIDATE_STR: 'Validate a pipeline with AWS without activating', - ACTIVATE_STR: 'Activate the pipeline on AWS', - VISUALIZE_STR: 'Visualize the pipeline', - } + # Pipeline parser pipeline_parser = subparsers.add_parser( - PIPELINE_COMMAND, - parents=[base_parser], + PIPELINE, formatter_class=formatter_class, - help='Command for various operations on pipeline definitions', + add_help=False, + parents=[help_parser] ) - pipeline_parser.add_argument( - 'action', - type=str, - choices=pipeline_choices, - help=choice_help_formatter(pipeline_choices), - default=CREATE_STR, + pipeline_subparsers = pipeline_parser.add_subparsers( + dest='action', + help='Pipeline actions', ) - pipeline_parser.add_argument( - 'load_definitions', - nargs='+', - help='Enter the paths of the load definitions', + + # Pipeline subparsers_action + pipeline_subparsers.add_parser( + CREATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + pipeline_run_options, + pipeline_definition_parser, + ], + help='Create a pipeline locally', ) - pipeline_parser.add_argument( - '-f', - '--force_overwrite', - action='store_true', - default=False, - help='Indicates that if this pipeline exists, it will be 
destroyed', + pipeline_subparsers.add_parser( + VALIDATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + pipeline_run_options, + pipeline_definition_parser, + ], + help='Validate a pipeline with AWS without activating', ) - pipeline_parser.add_argument( - '-d', - '--delay', - default=0, - type=int, - help='Delay the pipeline by x days', + pipeline_subparsers.add_parser( + ACTIVATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + pipeline_run_options, + pipeline_definition_parser, + ], + help='Activate the pipeline on AWS', ) - pipeline_parser.add_argument( - '--activities_only', - action='store_true', - help='Visualize only activities', + pipeline_subparsers.add_parser( + VISUALIZE, + formatter_class=formatter_class, + parents=[ + mode_parser, + file_parser, + pipeline_definition_parser, + ], + help='Visualize the pipeline', + ) + + # Config parser + config_parser = subparsers.add_parser( + CONFIG, + formatter_class=formatter_class, + add_help=False, + parents=[help_parser] + ) + config_subparsers = config_parser.add_subparsers( + dest='action', + help='config actions', ) - # Database parser declaration - database_choices = { - CREATE_STR: 'Create tables', - DROP_STR: 'Drop views and tables', - GRANT_STR: 'Grant permissions to neccessary groups', - RECREATE_STR: 'Recreate tables, load new data, drop old tables', - VISUALIZE_STR: 'Visualize the database er-diagram', - } + # config subparsers_action + config_subparsers.add_parser( + CONFIG_TO_S3, + formatter_class=formatter_class, + parents=[ + mode_parser, + ], + help='sync config file from local to s3', + ) + config_subparsers.add_parser( + CONFIG_FROM_S3, + formatter_class=formatter_class, + parents=[ + mode_parser, + file_parser, + ], + help='sync config file from s3 to local file', + ) + + # Database parser database_parser = subparsers.add_parser( - DATABASE_COMMAND, - parents=[base_parser], + DATABASE, formatter_class=formatter_class, - help='Command for various operations on the database', + add_help=False, + parents=[help_parser] ) - database_parser.add_argument( - 'action', - type=str, - choices=database_choices, - help=choice_help_formatter(database_choices), + database_subparsers = database_parser.add_subparsers( + dest='action', + help='database actions', ) - database_parser.add_argument( - 'table_definitions', - nargs='+', - help='Enter the paths of the table definitions', + + # database subparsers_action + database_subparsers.add_parser( + CREATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Create tables', + ) + database_subparsers.add_parser( + DROP, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Drop views and tables', + ) + database_subparsers.add_parser( + GRANT, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Grant permissions to neccessary groups', + ) + database_subparsers.add_parser( + RECREATE, + formatter_class=formatter_class, + parents=[ + mode_parser, + table_definition_parser, + ], + help='Recreate tables, load new data, drop old tables', + ) + database_subparsers.add_parser( + VISUALIZE, + formatter_class=formatter_class, + parents=[ + mode_parser, + file_parser, + table_definition_parser, + ], + help='Visualize the database er-diagram', ) # Check if autocomplete is possible @@ -292,15 +395,15 @@ def main(): # Certain modes in the config can override frequency of a pipeline frequency_override = 
config.etl.get('FREQUENCY_OVERRIDE', None) + arg_vars = vars(args) + # Action parse - if args.command == CONFIG_COMMAND: - config_actions(args.action, args.filename) - elif args.command == PIPELINE_COMMAND: - pipeline_actions(args.action, args.load_definitions, - args.force_overwrite, args.delay, frequency_override, - args.activities_only, args.filename) - elif args.command == DATABASE_COMMAND: - database_actions(args.action, args.table_definitions, args.filename) + if args.command == CONFIG: + config_actions(**arg_vars) + elif args.command == PIPELINE: + pipeline_actions(frequency_override=frequency_override, **arg_vars) + elif args.command == DATABASE: + database_actions(**arg_vars) else: raise ValueError('Unknown argument provided, use dataduct') diff --git a/dataduct/config/config_actions.py b/dataduct/config/config_actions.py index 709c236..c9fcee3 100644 --- a/dataduct/config/config_actions.py +++ b/dataduct/config/config_actions.py @@ -32,7 +32,7 @@ def sync_from_s3(filename): text = s3_file.text if filename is None: - print text + raise ValueError('Filename for config sync must be provided') else: with open(filename, 'w') as op_file: op_file.write(text) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 4264701..6ee8784 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -59,14 +59,14 @@ def create_pipeline(definition): return etl -def validate_pipeline(etl, force_overwrite=False): +def validate_pipeline(etl, force=False): """Validates the pipeline that was created Args: etl(EtlPipeline): pipeline object that needs to be validated - force_overwrite(bool): delete if a pipeline of same name exists + force(bool): delete if a pipeline of same name exists """ - if force_overwrite: + if force: etl.delete_if_exists() etl.validate() logger.debug(yaml.dump(etl.pipeline.aws_format)) From 410cc8678d29d3a9776352a165aa7b0cfc61ba22 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 2 Mar 2015 09:45:28 -0800 Subject: [PATCH 137/175] activities only added --- bin/dataduct | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/bin/dataduct b/bin/dataduct index 7f6dc4a..3f552ff 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -263,7 +263,7 @@ def main(): ], help='Activate the pipeline on AWS', ) - pipeline_subparsers.add_parser( + visualize_pipeline_parser = pipeline_subparsers.add_parser( VISUALIZE, formatter_class=formatter_class, parents=[ @@ -273,6 +273,12 @@ def main(): ], help='Visualize the pipeline', ) + visualize_pipeline_parser.add_argument( + '--activities_only', + action='store_true', + default=False, + help='Visualize only activities', + ) # Config parser config_parser = subparsers.add_parser( From 132893e6d66032df3072e517ab62f7a818a687b2 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 2 Mar 2015 15:41:11 -0800 Subject: [PATCH 138/175] remove from setup.py --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 993f35b..138d2c1 100644 --- a/setup.py +++ b/setup.py @@ -25,8 +25,7 @@ 'MySQL-python', 'pyparsing', 'testfixtures', - 'sphinx_rtd_theme', - 'argcomplete' + 'sphinx_rtd_theme' ], scripts=['bin/dataduct'], classifiers=[ From c83464dce391f2646afb6b6ede990089c9d07b85 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Mon, 2 Mar 2015 18:03:21 -0800 Subject: [PATCH 139/175] Encode strings in ColumnCheck in utf-8 --- dataduct/steps/scripts/column_check_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/dataduct/steps/scripts/column_check_test.py b/dataduct/steps/scripts/column_check_test.py index 9f3187b..3ea5264 100644 --- a/dataduct/steps/scripts/column_check_test.py +++ b/dataduct/steps/scripts/column_check_test.py @@ -114,11 +114,11 @@ def main(): # Open up a connection and read the source and destination tables source_data = _get_source_data(args.source_sql, args.source_host, args.sample_size) - print source_data.to_string() + print source_data.to_string().encode('utf-8') destination_data = _get_destination_data(args.destination_sql, list(source_data.index)) - print destination_data.to_string() + print destination_data.to_string().encode('utf-8') check = ColumnCheck(source_data, destination_data, name=args.test_name, From 7738c045dcf4ec6ce22655effd640cffad4d90b8 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Mon, 2 Mar 2015 20:20:02 -0800 Subject: [PATCH 140/175] Put the script filename through parse_path first --- dataduct/steps/upsert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/steps/upsert.py b/dataduct/steps/upsert.py index b24f8a9..c921553 100644 --- a/dataduct/steps/upsert.py +++ b/dataduct/steps/upsert.py @@ -32,7 +32,7 @@ def __init__(self, destination, redshift_database, sql=None, source_relation = Table(SqlScript(filename=parse_path(source))) else: source_relation = SelectStatement( - SqlScript(sql=sql, filename=script).sql()) + SqlScript(sql=sql, filename=parse_path(script)).sql()) # Create the destination table if doesn't exist script = dest.exists_clone_script() From c5e07a43cccb946bba6a369e75dc4af98c43cebf Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Mon, 2 Mar 2015 21:27:04 -0800 Subject: [PATCH 141/175] Actually set the variable to be a connection --- dataduct/pipeline/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index 1af1f0e..4507a5c 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -114,7 +114,7 @@ def list_pipeline_instances(pipeline_id, conn=None, increment=25): instances(list): list of pipeline instances """ if conn is None: - get_datapipeline_connection() + conn = get_datapipeline_connection() # Get all instances instance_ids = sorted(get_list_from_boto(conn.query_objects, From 0d06c3bdd6f47d017a88bf6cb70a25dd4c74cf25 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 3 Mar 2015 00:10:34 -0800 Subject: [PATCH 142/175] name prefix and dependency_override --- dataduct/steps/pipeline_dependencies.py | 39 +++++++++++++++++-------- 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/dataduct/steps/pipeline_dependencies.py b/dataduct/steps/pipeline_dependencies.py index ab55e3e..b0dd57b 100644 --- a/dataduct/steps/pipeline_dependencies.py +++ b/dataduct/steps/pipeline_dependencies.py @@ -5,6 +5,11 @@ from .transform import TransformStep from ..utils import constants as const +from ..config import Config + +config = Config() +NAME_PREFIX = config.etl.get('NAME_PREFIX', '') +DEPENDENCY_OVERRIDE = config.etl.get('DEPENDENCY_OVERRIDE', False) class PipelineDependenciesStep(TransformStep): @@ -33,24 +38,34 @@ def __init__(self, if dependent_pipelines is None: raise ValueError('Must have some dependencies for dependency step') - if start_date is None: - start_date = "#{format(@scheduledStartTime,'YYYY-MM-dd')}" + if DEPENDENCY_OVERRIDE: + command = 'ls' + script = None + script_arguments = None + else: + command = None + if start_date is None: + start_date = "#{format(@scheduledStartTime,'YYYY-MM-dd')}" - 
script_arguments.extend( - [ - '--start_date=%s' % start_date, - '--refresh_rate=%s' % str(refresh_rate), - '--dependencies', - ] - ) - script_arguments.extend(dependent_pipelines) + script_arguments.extend( + [ + '--start_date=%s' % start_date, + '--refresh_rate=%s' % str(refresh_rate), + '--dependencies', + ] + ) + script_arguments.extend([ + pipeline if not NAME_PREFIX else NAME_PREFIX + '_' + pipeline + for pipeline in dependent_pipelines + ]) - steps_path = os.path.abspath(os.path.dirname(__file__)) - script = os.path.join(steps_path, const.DEPENDENCY_SCRIPT_PATH) + steps_path = os.path.abspath(os.path.dirname(__file__)) + script = os.path.join(steps_path, const.DEPENDENCY_SCRIPT_PATH) super(PipelineDependenciesStep, self).__init__( id=id, script=script, + command=command, script_arguments=script_arguments, **kwargs) From 48b1d1b9da2d1be3d252ca845039daf334ec4966 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 5 Mar 2015 00:14:25 -0800 Subject: [PATCH 143/175] Backfill and force frequency --- bin/dataduct | 57 +++++++++++++++++------------ dataduct/etl/etl_pipeline.py | 8 ++-- dataduct/pipeline/default_object.py | 2 + dataduct/pipeline/schedule.py | 24 +++++++----- dataduct/tests/test_import.py | 7 ++++ requirements.txt | 1 + setup.py | 1 + 7 files changed, 63 insertions(+), 37 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index 3f552ff..329e688 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -6,6 +6,8 @@ import argparse from argparse import ArgumentParser from argparse import RawTextHelpFormatter +from pytimeparse import parse +from datetime import timedelta from dataduct.config import Config from dataduct.config import logger_configuration @@ -28,11 +30,9 @@ DROP = 'drop' GRANT = 'grant' RECREATE = 'recreate' -DEV = 'dev' - -def initialize_etl_objects(pipeline_definitions, delay=None, - frequency_override=None): +def initialize_etl_objects(pipeline_definitions, time_delta=None, + frequency_override=None, backfill=False): """Generate etl objects from yaml files """ from dataduct.etl import create_pipeline @@ -41,8 +41,11 @@ def initialize_etl_objects(pipeline_definitions, delay=None, etls = [] for pipeline_definition in pipeline_definitions: definition = read_pipeline_definition(pipeline_definition) - if delay is not None: - definition.update({'delay': delay}) + if time_delta is not None: + time_delta = timedelta(seconds=parse(time_delta)) + if backfill: + time_delta *= -1 + definition.update({'time_delta': time_delta}) if frequency_override is not None: definition.update({'frequency': frequency_override}) etls.append(create_pipeline(definition)) @@ -60,17 +63,17 @@ def config_actions(action, filename=None, **kwargs): return sync_from_s3(filename) -def pipeline_actions(action, pipeline_definitions, force=None, delay=None, +def pipeline_actions(action, pipeline_definitions, force=None, time_delta=None, frequency_override=None, activities_only=None, - filename=None, **kwargs): + filename=None, backfill=False, **kwargs): """Pipeline related actions are executed in this block """ from dataduct.etl import activate_pipeline from dataduct.etl import validate_pipeline from dataduct.etl import visualize_pipeline - for etl in initialize_etl_objects(pipeline_definitions, delay, - frequency_override): + for etl in initialize_etl_objects(pipeline_definitions, time_delta, + frequency_override, backfill): if action in [VALIDATE, ACTIVATE]: validate_pipeline(etl, force) if action == ACTIVATE: @@ -166,11 +169,22 @@ def main(): help='Indicates that if this pipeline exists, it will be 
destroyed', ) pipeline_run_options.add_argument( - '-d', - '--delay', - default=0, - type=int, - help='Delay the pipeline by x days', + '-t', + '--time_delta', + default='0h', + help='Timedelta the pipeline by x time difference', + ) + pipeline_run_options.add_argument( + '-b', + '--backfill', + action='store_true', + default=False, + help='Indicates that the timedelta supplied is for a backfill', + ) + pipeline_run_options.add_argument( + '--frequency', + default=None, + help='Frequency override to the pipeline', ) # Pipeline definitions parser @@ -378,13 +392,7 @@ def main(): except ImportError: pass args = parser.parse_args() - - # Check if mode is dev mode = args.mode - if mode is not None: - # We assume mode:dev = mode:None - if mode == DEV: - mode = None # To instantiate the singleton object with the correct state # As this is the single entry point to the library @@ -398,8 +406,11 @@ def main(): logger.warning('Running the pipeline in %s mode.' % config.mode) # Frequency override - # Certain modes in the config can override frequency of a pipeline - frequency_override = config.etl.get('FREQUENCY_OVERRIDE', None) + if hasattr(args, 'frequency') and args.frequency is not None: + frequency_override = args.frequency + else: + # Certain modes in the config can override frequency of a pipeline + frequency_override = config.etl.get('FREQUENCY_OVERRIDE', None) arg_vars = vars(args) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index cf86f2d..105c294 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -49,7 +49,7 @@ class ETLPipeline(object): """ def __init__(self, name, frequency='one-time', ec2_resource_config=None, - delay=0, emr_cluster_config=None, load_time=None, + time_delta='0h', emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=MAX_RETRIES, bootstrap=None, description=None): """Constructor for the pipeline class @@ -57,7 +57,7 @@ def __init__(self, name, frequency='one-time', ec2_resource_config=None, Args: name (str): Name of the pipeline should be globally unique. frequency (enum): Frequency of the pipeline. 
Can be - delay(int): Number of days to delay the pipeline by + time_delta(timedelta): Duration to change the start time by emr_cluster_config(dict): Dictionary for emr config topic_arn(str): sns alert to be used by the pipeline max_retries(int): number of retries for pipeline activities @@ -76,7 +76,7 @@ def __init__(self, name, frequency='one-time', ec2_resource_config=None, self.frequency = frequency self.load_hour = load_hour self.load_min = load_min - self.delay = delay + self.time_delta = time_delta self.description = description self.max_retries = max_retries self.topic_arn = topic_arn @@ -162,7 +162,7 @@ def create_base_objects(self): self.schedule = self.create_pipeline_object( object_class=Schedule, frequency=self.frequency, - delay=self.delay, + time_delta=self.time_delta, load_hour=self.load_hour, load_min=self.load_min, ) diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index a42823b..b7bc17a 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -8,6 +8,7 @@ config = Config() ROLE = config.etl['ROLE'] RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] +MAX_ACTIVE_INSTANCES = config.etl.get('MAX_ACTIVE_INSTANCES', 1) class DefaultObject(PipelineObject): @@ -35,6 +36,7 @@ def __init__(self, id, pipeline_log_uri, sns=None, scheduleType='cron', failureAndRerunMode=failureAndRerunMode, role=ROLE, resourceRole=RESOURCE_ROLE, + maxActiveInstances=MAX_ACTIVE_INSTANCES, pipelineLogUri=pipeline_log_uri, onFail=sns ) diff --git a/dataduct/pipeline/schedule.py b/dataduct/pipeline/schedule.py index b50c260..6fdd0c7 100644 --- a/dataduct/pipeline/schedule.py +++ b/dataduct/pipeline/schedule.py @@ -3,6 +3,7 @@ """ from datetime import datetime from datetime import timedelta +from pytimeparse import parse from ..config import Config from .pipeline_object import PipelineObject @@ -28,7 +29,7 @@ class Schedule(PipelineObject): def __init__(self, id, frequency='one-time', - delay=None, + time_delta=None, load_hour=None, load_minutes=None, **kwargs): @@ -38,7 +39,7 @@ def __init__(self, id(str): id of the Schedule object frequency(enum): rate at which pipeline should be run \ can be daily, hourly and one-time - delay(timedelta): Additional offset provided to the schedule + time_delta(timedelta): Additional offset provided to the schedule load_hour(int): Hour at which the pipeline should start load_minutes(int): Minutes at which the pipeline should be run **kwargs(optional): Keyword arguments directly passed to base class @@ -52,12 +53,12 @@ def __init__(self, if load_hour is None: load_hour = DAILY_LOAD_TIME - if delay is None: - delay = timedelta(0) - elif isinstance(delay, int): - delay = timedelta(days=delay) - elif not isinstance(delay, timedelta): - raise ETLInputError('Delay must be an instance of timedelta or int') + if time_delta is None: + time_delta = timedelta(seconds=0) + elif isinstance(time_delta, int): + time_delta = timedelta(days=time_delta) + elif not isinstance(time_delta, timedelta): + raise ETLInputError('time_delta must be an instance of timedelta or int') if frequency in FEQUENCY_PERIOD_CONVERTION: period, occurrences = FEQUENCY_PERIOD_CONVERTION[frequency] @@ -71,9 +72,12 @@ def __init__(self, start_time = start_time.replace(hour=load_hour) if current_time.hour < load_hour: - delay += timedelta(days=-1) + if frequency == 'one-time': + time_delta -= timedelta(days=1) + else: + time_delta -= timedelta(seconds=parse(period)) - start_time += delay + start_time += time_delta super(Schedule, self).__init__( 
id=id, diff --git a/dataduct/tests/test_import.py b/dataduct/tests/test_import.py index 92e1bcf..c355164 100644 --- a/dataduct/tests/test_import.py +++ b/dataduct/tests/test_import.py @@ -76,3 +76,10 @@ def test_testfixtures(): """ print 'Trying to import testfixtures' import testfixtures + + @staticmethod + def test_pytimeparse(): + """Testing pytimeparse + """ + print 'Trying to import pytimeparse' + import pytimeparse diff --git a/requirements.txt b/requirements.txt index e42b6eb..6021e82 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,3 +11,4 @@ pyparsing>=2 pygraphviz testfixtures>=4.1.1 mock +pytimeparse diff --git a/setup.py b/setup.py index 138d2c1..6422cf5 100644 --- a/setup.py +++ b/setup.py @@ -22,6 +22,7 @@ 'PyYAML', 'pandas', 'psycopg2', + 'pytimeparse' 'MySQL-python', 'pyparsing', 'testfixtures', From 7b0babd8e8279e2c93ffe4259db7c9ed469439e1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 5 Mar 2015 03:05:02 -0800 Subject: [PATCH 144/175] With query fix --- dataduct/database/parsers/select_query.py | 8 ++++++++ dataduct/database/parsers/tests/test_select_query.py | 9 +++++++++ dataduct/database/parsers/utils.py | 1 + setup.py | 2 +- 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/dataduct/database/parsers/select_query.py b/dataduct/database/parsers/select_query.py index 238d5e6..e4623c2 100644 --- a/dataduct/database/parsers/select_query.py +++ b/dataduct/database/parsers/select_query.py @@ -7,11 +7,15 @@ from pyparsing import Word from pyparsing import WordStart from pyparsing import ParseException +from pyparsing import Optional +from .utils import _as from .utils import _db_name from .utils import _from from .utils import _join from .utils import _select +from .utils import _with +from .utils import subquery from .utils import field_parser @@ -83,6 +87,10 @@ def parse_select_columns(string): if string == '': return list() + if string.upper().startswith('WITH'): + suppressor = _with + delimitedList(_db_name + _as + subquery) + string = suppressor.suppress().transformString(string) + # Supress everything after the first from suppressor = MatchFirst(_from) + restOfLine string = suppressor.suppress().transformString(string) diff --git a/dataduct/database/parsers/tests/test_select_query.py b/dataduct/database/parsers/tests/test_select_query.py index 18e9ce9..ebf2171 100644 --- a/dataduct/database/parsers/tests/test_select_query.py +++ b/dataduct/database/parsers/tests/test_select_query.py @@ -59,3 +59,12 @@ def test_columns(): columns = parse_select_columns(query) eq_(columns, result) + + @staticmethod + def test_with_query(): + """Basic test for select statement with the with query + """ + query = ('WITH data AS (SELECT x, y FROM xy) SELECT x,y FROM data') + + columns = parse_select_columns(query) + eq_(columns, ['x', 'y']) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index b260849..7a66635 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -47,6 +47,7 @@ # Select SQL Keywords _select = CaselessKeyword('SELECT') +_with = CaselessKeyword('WITH') _from = CaselessKeyword('FROM') _as = CaselessKeyword('AS') _join = CaselessKeyword('JOIN') diff --git a/setup.py b/setup.py index 6422cf5..02bd8d1 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ 'PyYAML', 'pandas', 'psycopg2', - 'pytimeparse' + 'pytimeparse', 'MySQL-python', 'pyparsing', 'testfixtures', From b3803ef2b3a698d96d11745de7be77c3936629ff Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 5 Mar 2015 
11:02:59 -0800 Subject: [PATCH 145/175] fix test --- dataduct/etl/etl_pipeline.py | 6 +++++- dataduct/etl/tests/test_etl_pipeline.py | 5 +++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index 105c294..cbda9b8 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -2,6 +2,7 @@ Class definition for DataPipeline """ from datetime import datetime +from datetime import timedelta import csv import os from StringIO import StringIO @@ -49,7 +50,7 @@ class ETLPipeline(object): """ def __init__(self, name, frequency='one-time', ec2_resource_config=None, - time_delta='0h', emr_cluster_config=None, load_time=None, + time_delta=None, emr_cluster_config=None, load_time=None, topic_arn=None, max_retries=MAX_RETRIES, bootstrap=None, description=None): """Constructor for the pipeline class @@ -71,6 +72,9 @@ def __init__(self, name, frequency='one-time', ec2_resource_config=None, else: load_hour, load_min = [None, None] + if time_delta is None: + time_delta = timedelta(seconds=0) + # Input variables self._name = name if not NAME_PREFIX else NAME_PREFIX + '_' + name self.frequency = frequency diff --git a/dataduct/etl/tests/test_etl_pipeline.py b/dataduct/etl/tests/test_etl_pipeline.py index 64fd270..a40c16c 100644 --- a/dataduct/etl/tests/test_etl_pipeline.py +++ b/dataduct/etl/tests/test_etl_pipeline.py @@ -4,6 +4,7 @@ from nose.tools import raises from nose.tools import eq_ +from datetime import timedelta from ..etl_pipeline import ETLPipeline from ...utils.exceptions import ETLInputError @@ -25,7 +26,7 @@ def test_construct_etl_pipeline(): 'test_pipeline', frequency='one-time', ec2_resource_config={'terminate_after':'2 Hours'}, - delay=13, + time_delta=timedelta(seconds=3600), emr_cluster_config={'cfg1': 'value'}, load_time='12:34', topic_arn='sns:topic-arn:test-case', @@ -37,7 +38,7 @@ def test_construct_etl_pipeline(): eq_(result.ec2_resource_config, {'terminate_after':'2 Hours'}) eq_(result.load_hour, 12) eq_(result.load_min, 34) - eq_(result.delay, 13) + eq_(result.time_delta, timedelta(seconds=3600)) eq_(result.max_retries, 5) eq_(result.topic_arn, 'sns:topic-arn:test-case') eq_(result.bootstrap_definitions, {'cfg1': 'value'}) From 5da6f5a55359c6f5c6b22cd78823c09c9214b189 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 5 Mar 2015 11:43:11 -0800 Subject: [PATCH 146/175] use constant --- dataduct/pipeline/default_object.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataduct/pipeline/default_object.py b/dataduct/pipeline/default_object.py index b7bc17a..53d10e8 100644 --- a/dataduct/pipeline/default_object.py +++ b/dataduct/pipeline/default_object.py @@ -4,11 +4,12 @@ from .pipeline_object import PipelineObject from ..config import Config +from ..utils import constants as const config = Config() ROLE = config.etl['ROLE'] RESOURCE_ROLE = config.etl['RESOURCE_ROLE'] -MAX_ACTIVE_INSTANCES = config.etl.get('MAX_ACTIVE_INSTANCES', 1) +MAX_ACTIVE_INSTANCES = config.etl.get('MAX_ACTIVE_INSTANCES', const.ONE) class DefaultObject(PipelineObject): From 5e4cd69586f8e293031f9d2d84b33db77261a20c Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Thu, 5 Mar 2015 14:03:09 -0800 Subject: [PATCH 147/175] Fix double quotes issue with replace_invalid_char --- dataduct/steps/scripts/create_load_redshift_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py 
index f96b05c..159932d 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -27,7 +27,7 @@ def load_redshift(table, input_paths, max_error=0, delete_statement = 'DELETE FROM %s;' % table_name error_string = 'MAXERROR %d' % max_error if max_error > 0 else '' if replace_invalid_char is not None: - invalid_char_str = "ACCEPTINVCHARS AS '%s'" % replace_invalid_char + invalid_char_str = "ACCEPTINVCHARS AS %s" % replace_invalid_char else: invalid_char_str = '' From 7cae5b7e91fbaa20b595ee8da5fabba84a6830f7 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 5 Mar 2015 14:12:14 -0800 Subject: [PATCH 148/175] Temp --- dataduct/s3/s3_log_path.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataduct/s3/s3_log_path.py b/dataduct/s3/s3_log_path.py index f8b38fc..eabfa56 100644 --- a/dataduct/s3/s3_log_path.py +++ b/dataduct/s3/s3_log_path.py @@ -14,8 +14,8 @@ class S3LogPath(S3Path): unless there is a backslash: :: - s3:://coursera-datapipeline/dev - s3:://coursera-datapipeline/dev_log_dir + s3:://coursera-bucket/dev + s3:://coursera-bucket/dev_log_dir However, if one adds a backslash to the log s3 URI, Data Pipeline will add another backslash before adding subdirectories. These From 433d9c9f0de63ae2621b6a4187572e31f0b74af5 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Fri, 27 Feb 2015 11:32:46 +0900 Subject: [PATCH 149/175] Fixed monitoring pipeline url when the region used --- dataduct/etl/etl_actions.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 6ee8784..788342c 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -7,6 +7,7 @@ from ..pipeline import MysqlNode from ..pipeline import RedshiftNode from ..pipeline import S3Node +from ..config import Config from ..utils.exceptions import ETLInputError from ..utils.slack_hook import post_message @@ -14,7 +15,11 @@ logger = logging.getLogger(__name__) +config = Config() +REGION = config.etl.get('REGION', None) URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa +if REGION: + URL_TEMPLATE = "https://console.aws.amazon.com/datapipeline/?®ion=%s#ExecutionDetailsPlace:pipelineId={ID}&show=latest" % REGION def read_pipeline_definition(file_path): From c9396853ae978ce6ba2bc766a613de53820e504b Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Sat, 7 Mar 2015 15:52:34 +0900 Subject: [PATCH 150/175] Changes expression --- dataduct/etl/etl_actions.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index 788342c..be1e547 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -17,10 +17,8 @@ config = Config() REGION = config.etl.get('REGION', None) -URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa -if REGION: - URL_TEMPLATE = "https://console.aws.amazon.com/datapipeline/?®ion=%s#ExecutionDetailsPlace:pipelineId={ID}&show=latest" % REGION - +URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest%s' # noqa +URL_TEMPLATE %= 'region=%s' % REGION if REGION is not None else '' def read_pipeline_definition(file_path): """Function reads the yaml pipeline definitions. 
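A minimal, standalone sketch of the optional-region formatting pattern used in the two patches above; the host, region value, and pipeline id below are placeholders chosen for illustration, not dataduct's actual values:

    # Placeholder template; dataduct's real console URL template differs.
    TEMPLATE = 'https://console.example.com/datapipeline/?%s#pipelineId={ID}'

    for region in ('us-west-2', None):  # 'us-west-2' is a hypothetical REGION
        url = TEMPLATE
        # '%' binds tighter than the conditional expression, so this reads as
        # ('region=%s' % region) if region is not None else ''
        url %= 'region=%s' % region if region is not None else ''
        print url.format(ID='df-0123456')  # hypothetical pipeline id

    # prints:
    # https://console.example.com/datapipeline/?region=us-west-2#pipelineId=df-0123456
    # https://console.example.com/datapipeline/?#pipelineId=df-0123456

When REGION is None the query string is left empty, matching the 'REGION is not None' guard in the patch above.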
From 0d9204c82e8e214eecf7459cb6d4f88a3177f782 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Fri, 27 Feb 2015 11:34:59 +0900 Subject: [PATCH 151/175] Expanded command options on create-load-redshift step --- .../scripts/create_load_redshift_runner.py | 39 ++++++++++++------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index 159932d..cf6ab06 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -11,7 +11,7 @@ def load_redshift(table, input_paths, max_error=0, - replace_invalid_char=None, no_escape=False, gzip=False): + replace_invalid_char=None, no_escape=False, gzip=False, command_options=None): """Load redshift table with the data in the input s3 paths """ table_name = table.full_name @@ -34,17 +34,27 @@ def load_redshift(table, input_paths, max_error=0, query = [delete_statement] for input_path in input_paths: - statement = ( - "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " - "DELIMITER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " - "{max_error} {invalid_char_str};" - ).format(table=table_name, - path=input_path, - creds=creds, - escape='ESCAPE' if not no_escape else '', - gzip='GZIP' if gzip else '', - max_error=error_string, - invalid_char_str=invalid_char_str) + if command_options: + statement = ( + "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " + "{command_options};" + ).format(table=table_name, + path=input_path, + creds=creds, + command_options=command_options) + else: + statement = ( + "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " + "DELIMITER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " + "{max_error} {invalid_char_str};" + ).format(table=table_name, + path=input_path, + creds=creds, + escape='ESCAPE' if not no_escape else '', + gzip='GZIP' if gzip else '', + max_error=error_string, + invalid_char_str=invalid_char_str) + query.append(statement) return ' '.join(query) @@ -60,6 +70,7 @@ def main(): default=None) parser.add_argument('--no_escape', action='store_true', default=False) parser.add_argument('--gzip', action='store_true', default=False) + parser.add_argument('--command_options', dest='command_options', default=None) parser.add_argument('--s3_input_paths', dest='input_paths', nargs='+') args = parser.parse_args() print args @@ -75,8 +86,8 @@ def main(): # Load data into redshift load_query = load_redshift(table, args.input_paths, args.max_error, args.replace_invalid_char, args.no_escape, - args.gzip) - + args.gzip, args.command_options) + print load_query cursor.execute(load_query) cursor.execute('COMMIT') cursor.close() From 58d91b0cc26e0076f2e1e1fd3e1b808b69b66224 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Sat, 7 Mar 2015 17:01:22 +0900 Subject: [PATCH 152/175] Changes expression --- .../scripts/create_load_redshift_runner.py | 30 ++++++++----------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index cf6ab06..c232e02 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -11,7 +11,8 @@ def load_redshift(table, input_paths, max_error=0, - replace_invalid_char=None, no_escape=False, gzip=False, command_options=None): + replace_invalid_char=None, no_escape=False, gzip=False, + command_options=None): """Load 
redshift table with the data in the input s3 paths """ table_name = table.full_name @@ -33,29 +34,25 @@ def load_redshift(table, input_paths, max_error=0, query = [delete_statement] + template = \ + "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' {options};" + for input_path in input_paths: - if command_options: - statement = ( - "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " - "{command_options};" - ).format(table=table_name, - path=input_path, - creds=creds, - command_options=command_options) - else: - statement = ( - "COPY {table} FROM '{path}' WITH CREDENTIALS AS '{creds}' " + if not command_options: + command_options = ( "DELIMITER '\t' {escape} {gzip} NULL AS 'NULL' TRUNCATECOLUMNS " "{max_error} {invalid_char_str};" - ).format(table=table_name, - path=input_path, - creds=creds, - escape='ESCAPE' if not no_escape else '', + ).format(escape='ESCAPE' if not no_escape else '', gzip='GZIP' if gzip else '', max_error=error_string, invalid_char_str=invalid_char_str) + statement = template.format(table=table_name, + path=input_path, + creds=creds, + options=command_options) query.append(statement) + return ' '.join(query) @@ -87,7 +84,6 @@ def main(): load_query = load_redshift(table, args.input_paths, args.max_error, args.replace_invalid_char, args.no_escape, args.gzip, args.command_options) - print load_query cursor.execute(load_query) cursor.execute('COMMIT') cursor.close() From ab20e79bc451f9e81fe315ebdc597dd5b1c4dda8 Mon Sep 17 00:00:00 2001 From: Sungju Jin Date: Sat, 7 Mar 2015 20:39:53 +0900 Subject: [PATCH 153/175] Fixed monitoring url --- dataduct/etl/etl_actions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/etl/etl_actions.py b/dataduct/etl/etl_actions.py index be1e547..47c6148 100644 --- a/dataduct/etl/etl_actions.py +++ b/dataduct/etl/etl_actions.py @@ -17,7 +17,7 @@ config = Config() REGION = config.etl.get('REGION', None) -URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?#ExecutionDetailsPlace:pipelineId={ID}&show=latest%s' # noqa +URL_TEMPLATE = 'https://console.aws.amazon.com/datapipeline/?%s#ExecutionDetailsPlace:pipelineId={ID}&show=latest' # noqa URL_TEMPLATE %= 'region=%s' % REGION if REGION is not None else '' def read_pipeline_definition(file_path): From 86414b1b2747295a095850cba86b70c97f702664 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 7 Mar 2015 21:36:55 -0800 Subject: [PATCH 154/175] QA steps don't create an output node --- dataduct/steps/qa_transform.py | 1 + dataduct/steps/transform.py | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/dataduct/steps/qa_transform.py b/dataduct/steps/qa_transform.py index fa2fea6..c9f7a52 100644 --- a/dataduct/steps/qa_transform.py +++ b/dataduct/steps/qa_transform.py @@ -38,6 +38,7 @@ def __init__(self, super(QATransformStep, self).__init__( id=id, script_arguments=script_arguments, + no_output=True, **kwargs) @classmethod diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 5537fce..849049d 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -32,6 +32,7 @@ def __init__(self, script_arguments=None, additional_s3_files=None, output_path=None, + no_output=False, **kwargs): """Constructor for the TransformStep class @@ -51,9 +52,12 @@ def __init__(self, raise ETLInputError( 'Only one of script, command and directory allowed') - # Create output_node based on output_path - base_output_node = self.create_s3_data_node( - self.get_output_s3_path(output_path)) + if not no_output: + # 
Create output_node based on output_path + base_output_node = self.create_s3_data_node( + self.get_output_s3_path(output_path)) + else: + base_output_node = None script_arguments = self.translate_arguments(script_arguments) From 437051c1044c79efc7912e7671dbfe1b5e159510 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sat, 7 Mar 2015 22:59:07 -0800 Subject: [PATCH 155/175] PR comments --- dataduct/steps/transform.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index 849049d..f34f8b4 100644 --- a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -52,12 +52,11 @@ def __init__(self, raise ETLInputError( 'Only one of script, command and directory allowed') + base_output_node = None if not no_output: # Create output_node based on output_path base_output_node = self.create_s3_data_node( self.get_output_s3_path(output_path)) - else: - base_output_node = None script_arguments = self.translate_arguments(script_arguments) From 81e428ef115fbc8a554551129840d3ac99222b4d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 8 Mar 2015 04:13:51 -0700 Subject: [PATCH 156/175] create and update with sql step --- dataduct/etl/utils.py | 1 + dataduct/steps/__init__.py | 17 ++++---- dataduct/steps/create_update_sql.py | 52 +++++++++++++++++++++++++ dataduct/steps/pipeline_dependencies.py | 1 + dataduct/steps/sql_command.py | 15 +++++-- dataduct/steps/upsert.py | 8 +++- examples/example_create_update_sql.yaml | 13 +++++++ 7 files changed, 95 insertions(+), 12 deletions(-) create mode 100644 dataduct/steps/create_update_sql.py create mode 100644 examples/example_create_update_sql.yaml diff --git a/dataduct/etl/utils.py b/dataduct/etl/utils.py index 08f53dd..8cdeefc 100644 --- a/dataduct/etl/utils.py +++ b/dataduct/etl/utils.py @@ -10,6 +10,7 @@ 'column-check': ColumnCheckStep, 'count-check': CountCheckStep, 'create-load-redshift': CreateAndLoadStep, + 'create-update-sql': CreateUpdateSqlStep, 'emr-step': EMRJobStep, 'emr-streaming': EMRStreamingStep, 'extract-local': ExtractLocalStep, diff --git a/dataduct/steps/__init__.py b/dataduct/steps/__init__.py index db675e8..282d020 100644 --- a/dataduct/steps/__init__.py +++ b/dataduct/steps/__init__.py @@ -1,18 +1,19 @@ -from .etl_step import ETLStep -from .emr_streaming import EMRStreamingStep +from .column_check import ColumnCheckStep +from .count_check import CountCheckStep +from .create_load_redshift import CreateAndLoadStep +from .create_update_sql import CreateUpdateSqlStep from .emr_job import EMRJobStep +from .emr_streaming import EMRStreamingStep +from .etl_step import ETLStep from .extract_local import ExtractLocalStep from .extract_rds import ExtractRdsStep from .extract_redshift import ExtractRedshiftStep from .extract_s3 import ExtractS3Step from .load_redshift import LoadRedshiftStep from .pipeline_dependencies import PipelineDependenciesStep +from .primary_key_check import PrimaryKeyCheckStep +from .qa_transform import QATransformStep +from .reload import ReloadStep from .sql_command import SqlCommandStep from .transform import TransformStep -from .qa_transform import QATransformStep -from .primary_key_check import PrimaryKeyCheckStep -from .count_check import CountCheckStep -from .column_check import ColumnCheckStep -from .create_load_redshift import CreateAndLoadStep from .upsert import UpsertStep -from .reload import ReloadStep diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py new file mode 100644 index 
0000000..1d7751d --- /dev/null +++ b/dataduct/steps/create_update_sql.py @@ -0,0 +1,52 @@ +""" +ETL step wrapper for sql command for inserting into tables +""" +from .sql_command import SqlCommandStep +from ..database import SqlScript +from ..database import Table +from ..utils.helpers import exactly_one +from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError + + +class CreateUpdateSqlStep(SqlCommandStep): + """Create and Insert step that creates a table and then uses the query to + update the table data with any sql query provided + """ + + def __init__(self, + table_definition, + script=None, + command=None, + analyze_table=True, + wrap_transaction=True, + **kwargs): + """Constructor for the CreateUpdateStep class + + Args: + **kwargs(optional): Keyword arguments directly passed to base class + """ + if not exactly_one(command, script): + raise ETLInputError('Both command or script found') + + # Create S3File with script / command provided + if script: + update_script = SqlScript(filename=parse_path(script)) + else: + update_script = SqlScript(command) + + dest = Table(SqlScript(filename=parse_path(table_definition))) + + sql_script = dest.exists_clone_script() + sql_script.append(dest.grant_script()) + sql_script.append(update_script) + + if wrap_transaction: + sql_script = sql_script.wrap_transaction() + + # Analyze cannot be done inside a transaction + if analyze_table: + sql_script.append(dest.analyze_script()) + + super(CreateUpdateSqlStep, self).__init__( + sql_script=sql_script, wrap_transaction=False, **kwargs) diff --git a/dataduct/steps/pipeline_dependencies.py b/dataduct/steps/pipeline_dependencies.py index b0dd57b..1f6303c 100644 --- a/dataduct/steps/pipeline_dependencies.py +++ b/dataduct/steps/pipeline_dependencies.py @@ -67,6 +67,7 @@ def __init__(self, script=script, command=command, script_arguments=script_arguments, + no_output=True, **kwargs) self._output = None diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 05bf764..5c64266 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -9,6 +9,9 @@ from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError +import logging +logger = logging.getLogger(__name__) + class SqlCommandStep(ETLStep): """SQL Command Step class that helps run scripts on resouces @@ -19,6 +22,7 @@ def __init__(self, script=None, script_arguments=None, queue=None, + sql_script=None, command=None, wrap_transaction=True, **kwargs): @@ -32,15 +36,18 @@ def __init__(self, redshift_database(RedshiftDatabase): database to excute the query **kwargs(optional): Keyword arguments directly passed to base class """ - if not exactly_one(command, script): + if not exactly_one(command, script, sql_script): raise ETLInputError('Both command or script found') + if not isinstance(sql_script, SqlScript): + raise ETLInputError('sql_script should be of the type SqlScript') + super(SqlCommandStep, self).__init__(**kwargs) # Create S3File with script / command provided if script: sql_script = SqlScript(filename=parse_path(script)) - else: + elif command: sql_script = SqlScript(command) if wrap_transaction: @@ -48,6 +55,9 @@ def __init__(self, script = self.create_script(S3File(text=sql_script.sql())) + logger.debug('Sql Query:') + logger.debug(sql_script) + self.create_pipeline_object( object_class=SqlActivity, max_retries=self.max_retries, @@ -72,5 +82,4 @@ def arguments_processor(cls, etl, input_args): step_args = cls.base_arguments_processor(etl, input_args) 
step_args['redshift_database'] = etl.redshift_database step_args['resource'] = etl.ec2_resource - return step_args diff --git a/dataduct/steps/upsert.py b/dataduct/steps/upsert.py index c921553..278ed8d 100644 --- a/dataduct/steps/upsert.py +++ b/dataduct/steps/upsert.py @@ -17,7 +17,8 @@ class UpsertStep(ETLStep): def __init__(self, destination, redshift_database, sql=None, script=None, source=None, enforce_primary_key=True, - delete_existing=False, history=None, **kwargs): + delete_existing=False, history=None, + analyze_table=True, **kwargs): """Constructor for the UpsertStep class Args: @@ -36,9 +37,14 @@ def __init__(self, destination, redshift_database, sql=None, # Create the destination table if doesn't exist script = dest.exists_clone_script() + script.append(dest.grant_script()) script.append(dest.upsert_script( source_relation, enforce_primary_key, delete_existing)) + # Analyze the destination table after the load + if analyze_table: + script.append(dest.analyze_script()) + if history: hist = HistoryTable(SqlScript( filename=parse_path(history))) diff --git a/examples/example_create_update_sql.yaml b/examples/example_create_update_sql.yaml new file mode 100644 index 0000000..e4ae06b --- /dev/null +++ b/examples/example_create_update_sql.yaml @@ -0,0 +1,13 @@ +name: example_create_update_sql +frequency: one-time +load_time: 01:00 # Hour:Min in UTC + +description: Example for the create-update-sql step + +steps: +- step_type: create-update-sql + command: | + DELETE FROM dev.test_table WHERE id < 0; + INSERT INTO dev.test_table + SELECT * FROM dev.test_table_2; + table_definition: tables/dev.test_table.sql From c5aff4df555d7fde8f814adc23c7ddaadb5ef46e Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Sun, 8 Mar 2015 15:06:48 -0700 Subject: [PATCH 157/175] single dependency on table --- dataduct/steps/extract_rds.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/steps/extract_rds.py b/dataduct/steps/extract_rds.py index c33d023..87b4497 100644 --- a/dataduct/steps/extract_rds.py +++ b/dataduct/steps/extract_rds.py @@ -46,7 +46,7 @@ def __init__(self, if table: sql = 'SELECT * FROM %s;' % table elif sql: - table = SelectStatement(sql).dependencies + table = SelectStatement(sql).dependencies[0] else: raise ETLInputError('Provide a sql statement or a table name') From c3654e19498721e178d768e7fe20111955d78e82 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Mar 2015 00:10:55 -0700 Subject: [PATCH 158/175] Command line argument formatting --- bin/dataduct | 137 ++------------------------------------- dataduct/s3/utils.py | 32 +++++++++ dataduct/utils/cli.py | 146 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 182 insertions(+), 133 deletions(-) create mode 100644 dataduct/utils/cli.py diff --git a/bin/dataduct b/bin/dataduct index 329e688..a156915 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -3,14 +3,11 @@ """Script that helps create and validate pipelines from command line """ -import argparse from argparse import ArgumentParser -from argparse import RawTextHelpFormatter from pytimeparse import parse from datetime import timedelta -from dataduct.config import Config -from dataduct.config import logger_configuration +from dataduct.utils.cli import * # noqa import logging logger = logging.getLogger(__name__) @@ -105,124 +102,9 @@ def database_actions(action, table_definitions, filename=None, **kwargs): print script -class _HelpAction(argparse._HelpAction): - """HelpAction class used to render a custom help message - """ - def 
__call__(self, parser, namespace, values, option_string=None): - parser.print_help() - print '' - # retrieve subparsers from parser - subparsers_actions = [ - action for action in parser._actions - if isinstance(action, argparse._SubParsersAction)] - - for subparsers_action in subparsers_actions: - # get all subparsers and print help - for choice, subparser in subparsers_action.choices.items(): - print "Command '{}'".format(choice) - print subparser.format_usage() - parser.exit() - - def main(): - """Main function + """Main function that parses the command line arguments """ - formatter_class = lambda prog: RawTextHelpFormatter( - prog, max_help_position=50) - - # Help parser for parsing subparsers in help - help_parser = ArgumentParser( - description='Run Dataduct commands', - add_help=False, - formatter_class=formatter_class, - ) - help_parser.add_argument( - '-h', - '--help', - action=_HelpAction, - help='Help message', - ) - - # Mode parser shared across all pipeline subparsers - mode_help = 'Mode to run the pipeline and config overrides to use' - mode_parser = ArgumentParser( - description=mode_help, - add_help=False, - ) - mode_parser.add_argument( - '-m', - '--mode', - default=None, - help=mode_help - ) - - # Options parser shared actions all pipeline run options - pipeline_run_options = ArgumentParser( - description='Specify actions related to running the pipelines', - add_help=False - ) - pipeline_run_options.add_argument( - '-f', - '--force', - action='store_true', - default=False, - help='Indicates that if this pipeline exists, it will be destroyed', - ) - pipeline_run_options.add_argument( - '-t', - '--time_delta', - default='0h', - help='Timedelta the pipeline by x time difference', - ) - pipeline_run_options.add_argument( - '-b', - '--backfill', - action='store_true', - default=False, - help='Indicates that the timedelta supplied is for a backfill', - ) - pipeline_run_options.add_argument( - '--frequency', - default=None, - help='Frequency override to the pipeline', - ) - - # Pipeline definitions parser - pipeline_definition_help = 'Paths of the pipeline definitions' - pipeline_definition_parser = ArgumentParser( - description=pipeline_definition_help, - add_help=False, - ) - pipeline_definition_parser.add_argument( - 'pipeline_definitions', - nargs='+', - help=pipeline_definition_help, - ) - - # Table definitions parser - table_definition_help = 'Paths of the table definitions' - table_definition_parser = ArgumentParser( - description=table_definition_help, - add_help=False, - ) - table_definition_parser.add_argument( - 'table_definitions', - nargs='+', - help=table_definition_help, - ) - - # Filepath input parser - filepath_help = 'filepath input for storing output of actions' - file_parser = ArgumentParser( - description=filepath_help, - add_help=False, - ) - file_parser.add_argument( - dest='filename', - help='Filename to store output of commands', - ) - - # Main parser parser = ArgumentParser( description='Run Dataduct commands', add_help=False, @@ -392,18 +274,7 @@ def main(): except ImportError: pass args = parser.parse_args() - mode = args.mode - - # To instantiate the singleton object with the correct state - # As this is the single entry point to the library - # We can use the __new__ function to set the debug_level - config = Config(mode=mode) - - # Setup up logging for package - logger_configuration() - - if mode is not None: - logger.warning('Running the pipeline in %s mode.' 
% config.mode) + config = config_singleton_setup(args) # Frequency override if hasattr(args, 'frequency') and args.frequency is not None: @@ -422,7 +293,7 @@ def main(): elif args.command == DATABASE: database_actions(**arg_vars) else: - raise ValueError('Unknown argument provided, use dataduct') + raise ValueError('Unknown argument provided, use dataduct -h') if __name__ == '__main__': diff --git a/dataduct/s3/utils.py b/dataduct/s3/utils.py index 712cffc..b645bc1 100644 --- a/dataduct/s3/utils.py +++ b/dataduct/s3/utils.py @@ -158,3 +158,35 @@ def delete_dir_from_s3(s3_path): keys = bucket.get_all_keys(prefix=s3_path.key) for key in keys: key.delete() + + +def copy_dir_with_s3(s3_old_path, s3_new_path, raise_when_no_exist=True): + """Copies files from one S3 Path to another + + Args: + s3_old_path(S3Path): Output path of the file to be uploaded + s3_new_path(S3Path): Output path of the file to be uploaded + raise_when_no_exist(bool, optional): Raise error if file not found + + Raises: + ETLInputError: If s3_old_path does not exist + """ + assert isinstance(s3_old_path, S3Path), 'old path should be of type S3Path' + assert s3_old_path.is_directory, 'S3 old path must be directory' + assert isinstance(s3_new_path, S3Path), 'new path should be of type S3Path' + assert s3_new_path.is_directory, 'S3 new path must be directory' + + bucket = get_s3_bucket(s3_old_path.bucket) + prefix = s3_old_path.key + + # Enforce this to be a folder's prefix + if not prefix.endswith('/'): + prefix += '/' + keys = bucket.get_all_keys(prefix=s3_old_path.key) + for key in keys: + if key: + key.copy(s3_new_path.bucket, + os.path.join(s3_new_path.key, os.path.basename(key.key))) + + if raise_when_no_exist and not key: + raise ETLInputError('The key does not exist: %s' % s3_old_path.uri) diff --git a/dataduct/utils/cli.py b/dataduct/utils/cli.py new file mode 100644 index 0000000..45bca50 --- /dev/null +++ b/dataduct/utils/cli.py @@ -0,0 +1,146 @@ +"""Helper function for CLI scripts +""" +from argparse import ArgumentParser +from argparse import RawTextHelpFormatter +import argparse + + +def config_singleton_setup(args): + """Setup the config singleton based on the mode in args + """ + mode = args.mode if hasattr(args, 'mode') else None + import logging + logger = logging.getLogger(__name__) + + from dataduct.config import Config + from dataduct.config import logger_configuration + + + # To instantiate the singleton object with the correct state + # As this is the single entry point to the library + # We can use the __new__ function to set the debug_level + config = Config(mode=mode) + + # Setup up logging for package + logger_configuration() + + if mode is not None: + logger.warning('Running in %s mode', config.mode) + return config + + +# Override help action for better help output +class DataductHelpAction(argparse._HelpAction): + """HelpAction class used to render a custom help message + """ + def __call__(self, parser, namespace, values, option_string=None): + parser.print_help() + print '' + # retrieve subparsers from parser + subparsers_actions = [ + action for action in parser._actions + if isinstance(action, argparse._SubParsersAction)] + + for subparsers_action in subparsers_actions: + # get all subparsers and print help + for choice, subparser in subparsers_action.choices.items(): + print "Command '{}'".format(choice) + print subparser.format_usage() + parser.exit() + + +# Change the width of the output format +formatter_class = lambda prog: RawTextHelpFormatter(prog, max_help_position=50) + + +# Help 
parser for parsing subparsers in help +help_parser = ArgumentParser( + description='Run Dataduct commands', + add_help=False, + formatter_class=formatter_class, +) +help_parser.add_argument( + '-h', + '--help', + action=DataductHelpAction, + help='Help message', +) + +# Mode parser shared across all pipeline subparsers +mode_help = 'Mode or config overrides to use for the commands' +mode_parser = ArgumentParser( + description=mode_help, + add_help=False, +) +mode_parser.add_argument( + '-m', + '--mode', + default=None, + help=mode_help +) + +# Options parser shared actions all pipeline run options +pipeline_run_options = ArgumentParser( + description='Specify actions related to running the pipelines', + add_help=False +) +pipeline_run_options.add_argument( + '-f', + '--force', + action='store_true', + default=False, + help='Indicates that if this pipeline exists, it will be destroyed', +) +pipeline_run_options.add_argument( + '-t', + '--time_delta', + default='0h', + help='Timedelta the pipeline by x time difference', +) +pipeline_run_options.add_argument( + '-b', + '--backfill', + action='store_true', + default=False, + help='Indicates that the timedelta supplied is for a backfill', +) +pipeline_run_options.add_argument( + '--frequency', + default=None, + help='Frequency override to the pipeline', +) + +# Pipeline definitions parser +pipeline_definition_help = 'Paths of the pipeline definitions' +pipeline_definition_parser = ArgumentParser( + description=pipeline_definition_help, + add_help=False, +) +pipeline_definition_parser.add_argument( + 'pipeline_definitions', + nargs='+', + help=pipeline_definition_help, +) + +# Table definitions parser +table_definition_help = 'Paths of the table definitions' +table_definition_parser = ArgumentParser( + description=table_definition_help, + add_help=False, +) +table_definition_parser.add_argument( + 'table_definitions', + nargs='+', + help=table_definition_help, +) + +# Filepath input parser +filepath_help = 'filepath input for storing output of actions' +file_parser = ArgumentParser( + description=filepath_help, + add_help=False, +) +file_parser.add_argument( + dest='filename', + help='Filename to store output of commands', +) From 790159e28ace735e6bb1b67cdd84485e174af74c Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Mar 2015 12:02:09 -0700 Subject: [PATCH 159/175] PR Comments --- dataduct/s3/s3_directory.py | 8 +++-- dataduct/s3/s3_file.py | 3 +- dataduct/s3/utils.py | 64 ++++++++++++++++++++++++------------- dataduct/utils/cli.py | 22 +++++++++---- 4 files changed, 66 insertions(+), 31 deletions(-) diff --git a/dataduct/s3/s3_directory.py b/dataduct/s3/s3_directory.py index e0c845c..1eb1f50 100644 --- a/dataduct/s3/s3_directory.py +++ b/dataduct/s3/s3_directory.py @@ -4,6 +4,7 @@ from .s3_path import S3Path from .utils import upload_dir_to_s3 from ..utils.helpers import parse_path +from ..utils.exceptions import ETLInputError class S3Directory(object): @@ -38,8 +39,11 @@ def s3_path(self, value): Args: value(S3Path): s3path of the directory """ - assert isinstance(value, S3Path), 'input path must be of type S3Path' - assert value.is_directory, 'input path must be a directory' + if not isinstance(value, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not value.is_directory: + raise ETLInputError('S3 path must be directory') self._s3_path = value def upload_to_s3(self): diff --git a/dataduct/s3/s3_file.py b/dataduct/s3/s3_file.py index 9076ac1..4654a7c 100644 --- a/dataduct/s3/s3_file.py +++ 
b/dataduct/s3/s3_file.py @@ -93,7 +93,8 @@ def s3_path(self, s3_path): If there is no path, the name "file" will be applied. """ - assert isinstance(s3_path, S3Path), 'input path must be of type S3Path' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') # Copy the object as we would change it for the file self._s3_path = S3Path( diff --git a/dataduct/s3/utils.py b/dataduct/s3/utils.py index b645bc1..33937d4 100644 --- a/dataduct/s3/utils.py +++ b/dataduct/s3/utils.py @@ -31,7 +31,8 @@ def read_from_s3(s3_path): Returns: results(str): Contents of the file as a string """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') bucket = get_s3_bucket(s3_path.bucket) key = bucket.get_key(s3_path.key) @@ -47,8 +48,11 @@ def upload_to_s3(s3_path, file_name=None, file_text=None): file_name(str): Name of the file to be uploaded to s3 file_text(str): Contents of the file to be uploaded """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert any([file_name, file_text]), 'file_name or text should be given' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not any([file_name, file_text]): + raise ETLInputError('File_name or text should be given') bucket = get_s3_bucket(s3_path.bucket) if s3_path.is_directory: @@ -78,8 +82,7 @@ def copy_within_s3(s3_old_path, s3_new_path, raise_when_no_exist=True): key = bucket.get_key(s3_old_path.key) if key: key.copy(s3_new_path.bucket, s3_new_path.key) - - if raise_when_no_exist and not key: + elif raise_when_no_exist: raise ETLInputError('The key does not exist: %s' % s3_old_path.uri) @@ -91,9 +94,14 @@ def upload_dir_to_s3(s3_path, local_path, filter_function=None): local_path(file_path): Input path of the file to be uploaded filter_function(function): Function to filter out directories """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert s3_path.is_directory, 'S3 path must be directory' - assert os.path.isdir(local_path), 'Local path must be a directory' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not s3_path.is_directory: + raise ETLInputError('S3 path must be directory') + + if not os.path.isdir(local_path): + raise ETLInputError('Local path must be a directory') bucket = get_s3_bucket(s3_path.bucket) @@ -119,8 +127,11 @@ def download_dir_from_s3(s3_path, local_path): s3_path(S3Path): Input path of the file to be downloaded local_path(file_path): Output path of the file to be downloaded """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert s3_path.is_directory, 'S3 path must be directory' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not s3_path.is_directory: + raise ETLInputError('S3 path must be directory') bucket = get_s3_bucket(s3_path.bucket) keys = bucket.get_all_keys(prefix=s3_path.key + '/') @@ -146,15 +157,18 @@ def delete_dir_from_s3(s3_path): Args: s3_path(S3Path): Path of the directory to be deleted """ - assert isinstance(s3_path, S3Path), 'input path should be of type S3Path' - assert s3_path.is_directory, 'S3 path must be directory' + if not isinstance(s3_path, S3Path): + raise ETLInputError('Input path should be of type S3Path') + + if not s3_path.is_directory: + raise ETLInputError('S3 path must be 
directory') bucket = get_s3_bucket(s3_path.bucket) prefix = s3_path.key # Enforce this to be a folder's prefix - if not prefix.endswith('/'): - prefix += '/' + prefix += '/' if not prefix.endswith('/') else '' + keys = bucket.get_all_keys(prefix=s3_path.key) for key in keys: key.delete() @@ -171,22 +185,28 @@ def copy_dir_with_s3(s3_old_path, s3_new_path, raise_when_no_exist=True): Raises: ETLInputError: If s3_old_path does not exist """ - assert isinstance(s3_old_path, S3Path), 'old path should be of type S3Path' - assert s3_old_path.is_directory, 'S3 old path must be directory' - assert isinstance(s3_new_path, S3Path), 'new path should be of type S3Path' - assert s3_new_path.is_directory, 'S3 new path must be directory' + if not isinstance(s3_old_path, S3Path): + raise ETLInputError('S3 old path should be of type S3Path') + + if not s3_old_path.is_directory: + raise ETLInputError('S3 old path must be directory') + + if not isinstance(s3_new_path, S3Path): + raise ETLInputError('S3 new path should be of type S3Path') + + if not s3_new_path.is_directory: + raise ETLInputError('S3 new path must be directory') bucket = get_s3_bucket(s3_old_path.bucket) prefix = s3_old_path.key # Enforce this to be a folder's prefix - if not prefix.endswith('/'): - prefix += '/' + prefix += '/' if not prefix.endswith('/') else '' + keys = bucket.get_all_keys(prefix=s3_old_path.key) for key in keys: if key: key.copy(s3_new_path.bucket, os.path.join(s3_new_path.key, os.path.basename(key.key))) - - if raise_when_no_exist and not key: + elif raise_when_no_exist: raise ETLInputError('The key does not exist: %s' % s3_old_path.uri) diff --git a/dataduct/utils/cli.py b/dataduct/utils/cli.py index 45bca50..9c22ea2 100644 --- a/dataduct/utils/cli.py +++ b/dataduct/utils/cli.py @@ -7,15 +7,24 @@ def config_singleton_setup(args): """Setup the config singleton based on the mode in args + + Note: + To instantiate the singleton object with the correct state as this is + the single entry point to the library. We can use the __new__ function + to set the debug_level + + We import inside the function as the singleton declaration should be + done here and at no other entry point. The same pattern is followed + at all the entry point scripts. 
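+
+        bin/dataduct, for instance, calls config_singleton_setup(args)
+        with the parsed command line arguments immediately after
+        parser.parse_args(), so the config is in place before the rest
+        of the library is used.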
""" mode = args.mode if hasattr(args, 'mode') else None + import logging logger = logging.getLogger(__name__) from dataduct.config import Config from dataduct.config import logger_configuration - # To instantiate the singleton object with the correct state # As this is the single entry point to the library # We can use the __new__ function to set the debug_level @@ -36,7 +45,8 @@ class DataductHelpAction(argparse._HelpAction): def __call__(self, parser, namespace, values, option_string=None): parser.print_help() print '' - # retrieve subparsers from parser + + # Retrieve subparsers from parser subparsers_actions = [ action for action in parser._actions if isinstance(action, argparse._SubParsersAction)] @@ -81,7 +91,7 @@ def __call__(self, parser, namespace, values, option_string=None): # Options parser shared actions all pipeline run options pipeline_run_options = ArgumentParser( - description='Specify actions related to running the pipelines', + description='Specify actions related to running pipelines', add_help=False ) pipeline_run_options.add_argument( @@ -89,7 +99,7 @@ def __call__(self, parser, namespace, values, option_string=None): '--force', action='store_true', default=False, - help='Indicates that if this pipeline exists, it will be destroyed', + help='Destroy previous versions of this pipeline, if they exist', ) pipeline_run_options.add_argument( '-t', @@ -107,7 +117,7 @@ def __call__(self, parser, namespace, values, option_string=None): pipeline_run_options.add_argument( '--frequency', default=None, - help='Frequency override to the pipeline', + help='Frequency override for the pipeline', ) # Pipeline definitions parser @@ -135,7 +145,7 @@ def __call__(self, parser, namespace, values, option_string=None): ) # Filepath input parser -filepath_help = 'filepath input for storing output of actions' +filepath_help = 'Filepath input for storing output of actions' file_parser = ArgumentParser( description=filepath_help, add_help=False, From 4b1b331369f56d59bf266272a104584967dc380d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Mar 2015 13:13:52 -0700 Subject: [PATCH 160/175] PR comments 2 --- dataduct/utils/cli.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/dataduct/utils/cli.py b/dataduct/utils/cli.py index 9c22ea2..f542fb2 100644 --- a/dataduct/utils/cli.py +++ b/dataduct/utils/cli.py @@ -25,9 +25,6 @@ def config_singleton_setup(args): from dataduct.config import Config from dataduct.config import logger_configuration - # To instantiate the singleton object with the correct state - # As this is the single entry point to the library - # We can use the __new__ function to set the debug_level config = Config(mode=mode) # Setup up logging for package @@ -38,7 +35,6 @@ def config_singleton_setup(args): return config -# Override help action for better help output class DataductHelpAction(argparse._HelpAction): """HelpAction class used to render a custom help message """ From 71d65de997ca2d961feaa297f5282a96b188dad7 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Mar 2015 13:17:16 -0700 Subject: [PATCH 161/175] PR comments --- dataduct/steps/create_update_sql.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py index 1d7751d..9324a1d 100644 --- a/dataduct/steps/create_update_sql.py +++ b/dataduct/steps/create_update_sql.py @@ -1,5 +1,4 @@ -""" -ETL step wrapper for sql command for inserting into tables +"""ETL step wrapper for sql command for inserting into tables 
""" from .sql_command import SqlCommandStep from ..database import SqlScript @@ -27,7 +26,7 @@ def __init__(self, **kwargs(optional): Keyword arguments directly passed to base class """ if not exactly_one(command, script): - raise ETLInputError('Both command or script found') + raise ETLInputError('Both command and script found') # Create S3File with script / command provided if script: From 7c1e1e324d4162e3d431304a40a54471deb5262d Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Mon, 9 Mar 2015 16:28:18 -0700 Subject: [PATCH 162/175] INT keyword --- dataduct/database/parsers/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dataduct/database/parsers/utils.py b/dataduct/database/parsers/utils.py index 7a66635..67d2044 100644 --- a/dataduct/database/parsers/utils.py +++ b/dataduct/database/parsers/utils.py @@ -12,6 +12,7 @@ # Data types _smallint = CaselessKeyword('SMALLINT') +_int = CaselessKeyword('INT') _integer = CaselessKeyword('INTEGER') _bigint = CaselessKeyword('BIGINT') _decimal = Combine(CaselessKeyword('DECIMAL') + '(' + Word(nums + ',') + ')') @@ -58,7 +59,7 @@ # Column types column_types = _smallint | _integer | _bigint | _decimal | _real | _double -column_types |= _boolean | _char | _varchar | _date | _timestamp +column_types |= _boolean | _char | _varchar | _date | _timestamp | _int # Define a field parser for create table fields or select query fields field_parser = Forward() From 32980eb57d94fc6f312e571585234a0893fecd7f Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 10 Mar 2015 11:59:23 -0700 Subject: [PATCH 163/175] path fixes --- dataduct/config/example_config | 2 +- dataduct/config/logger_config.py | 2 +- dataduct/steps/extract_s3.py | 3 +++ dataduct/steps/transform.py | 3 ++- dataduct/utils/helpers.py | 14 ++++++++++++-- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/dataduct/config/example_config b/dataduct/config/example_config index 819ad05..5028212 100644 --- a/dataduct/config/example_config +++ b/dataduct/config/example_config @@ -9,7 +9,7 @@ emr: MASTER_INSTANCE_TYPE: m1.large NUM_CORE_INSTANCES: 1 CORE_INSTANCE_TYPE: m1.large - CLUSTER_AMI: 2.4.7 + CLUSTER_AMI: 3.1.0 etl: S3_ETL_BUCKET: FILL_ME_IN diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index d2405e9..30f7b59 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -20,7 +20,7 @@ def logger_configuration(): if hasattr(config, 'logging'): log_directory = config.logging.get( - 'LOG_DIR', os.path.join(os.path.expanduser(CONFIG_DIR))) + 'LOG_DIR', os.path.expanduser('~' + CONFIG_DIR)) file_name = config.logging.get( 'LOG_FILE', LOG_FILE) diff --git a/dataduct/steps/extract_s3.py b/dataduct/steps/extract_s3.py index de7850c..dbb9477 100644 --- a/dataduct/steps/extract_s3.py +++ b/dataduct/steps/extract_s3.py @@ -5,6 +5,7 @@ from ..s3 import S3Path from ..utils.helpers import exactly_one from ..utils.exceptions import ETLInputError +from ..utils.helpers import get_modified_s3_path class ExtractS3Step(ETLStep): @@ -25,8 +26,10 @@ def __init__(self, directory_uri=None, file_uri=None, **kwargs): super(ExtractS3Step, self).__init__(**kwargs) if directory_uri: + directory_uri = get_modified_s3_path(directory_uri) s3_path = S3Path(uri=directory_uri, is_directory=True) else: + file_uri = get_modified_s3_path(file_uri) s3_path = S3Path(uri=file_uri) self._output = self.create_s3_data_node(s3_path) diff --git a/dataduct/steps/transform.py b/dataduct/steps/transform.py index f34f8b4..c8e7c18 100644 --- 
a/dataduct/steps/transform.py +++ b/dataduct/steps/transform.py @@ -9,6 +9,7 @@ from ..s3 import S3File from ..s3 import S3Directory from ..utils.helpers import exactly_one +from ..utils.helpers import get_modified_s3_path from ..utils.exceptions import ETLInputError from ..utils import constants as const @@ -56,7 +57,7 @@ def __init__(self, if not no_output: # Create output_node based on output_path base_output_node = self.create_s3_data_node( - self.get_output_s3_path(output_path)) + self.get_output_s3_path(get_modified_s3_path(output_path))) script_arguments = self.translate_arguments(script_arguments) diff --git a/dataduct/utils/helpers.py b/dataduct/utils/helpers.py index 257abbf..c128ca6 100644 --- a/dataduct/utils/helpers.py +++ b/dataduct/utils/helpers.py @@ -130,10 +130,12 @@ def parse_path(path, path_type=RESOURCE_BASE_PATH): config = Config() if path_type == RESOURCE_BASE_PATH: if RESOURCE_BASE_PATH in config.etl: - return os.path.join(config.etl[RESOURCE_BASE_PATH], path) + return os.path.join( + os.path.expanduser(config.etl[RESOURCE_BASE_PATH]), path) else: if CUSTOM_STEPS_PATH in config.etl: - return os.path.join(config.etl[CUSTOM_STEPS_PATH], path) + return os.path.join( + os.path.expanduser(config.etl[CUSTOM_STEPS_PATH]), path) # Return the path as is. return path @@ -145,3 +147,11 @@ def get_s3_base_path(): config = Config() return os.path.join('s3://', config.etl.get('S3_ETL_BUCKET', ''), config.etl.get('S3_BASE_PATH', '')) + +def get_modified_s3_path(path): + """Modify the s3 path to replace S3_BASE_PATH with config parameter + """ + config = Config() + if path is None: + return None + return path.replace('{S3_BASE_PATH}', config.etl.get('S3_BASE_PATH')) From 89b499278ba90ddd3b405fc38c6d95c4e9b3f85c Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Tue, 10 Mar 2015 14:56:11 -0700 Subject: [PATCH 164/175] Do not check the type of sqlscript if it is None --- dataduct/steps/sql_command.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataduct/steps/sql_command.py b/dataduct/steps/sql_command.py index 5c64266..7e8d918 100644 --- a/dataduct/steps/sql_command.py +++ b/dataduct/steps/sql_command.py @@ -37,9 +37,9 @@ def __init__(self, **kwargs(optional): Keyword arguments directly passed to base class """ if not exactly_one(command, script, sql_script): - raise ETLInputError('Both command or script found') + raise ETLInputError('Both command and script found') - if not isinstance(sql_script, SqlScript): + if sql_script is not None and not isinstance(sql_script, SqlScript): raise ETLInputError('sql_script should be of the type SqlScript') super(SqlCommandStep, self).__init__(**kwargs) From f04dc28e40b855d3c93c4e78a56926452f955d53 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 11 Mar 2015 12:01:22 -0700 Subject: [PATCH 165/175] Change placement of expanduser so that you can write '~' in config file --- dataduct/config/logger_config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dataduct/config/logger_config.py b/dataduct/config/logger_config.py index 30f7b59..af739fc 100644 --- a/dataduct/config/logger_config.py +++ b/dataduct/config/logger_config.py @@ -19,8 +19,8 @@ def logger_configuration(): config = Config() if hasattr(config, 'logging'): - log_directory = config.logging.get( - 'LOG_DIR', os.path.expanduser('~' + CONFIG_DIR)) + log_directory = os.path.expanduser(config.logging.get( + 'LOG_DIR', '~' + CONFIG_DIR)) file_name = config.logging.get( 'LOG_FILE', LOG_FILE) From 774525e15879251e4ca01ab03ecd295bbdc25a53 Mon Sep 
17 00:00:00 2001 From: Sourabh Bajaj Date: Wed, 11 Mar 2015 13:47:43 -0700 Subject: [PATCH 166/175] Move upsert to shell command activity --- dataduct/database/table.py | 12 +++++ dataduct/database/view.py | 12 +++++ dataduct/steps/create_load_redshift.py | 5 +- dataduct/steps/create_update_sql.py | 37 ++++++++++---- .../scripts/create_load_redshift_runner.py | 11 ++-- dataduct/steps/scripts/sql_runner.py | 47 +++++++++++++++++ dataduct/steps/upsert.py | 50 ++++--------------- dataduct/utils/constants.py | 2 + 8 files changed, 119 insertions(+), 57 deletions(-) create mode 100644 dataduct/steps/scripts/sql_runner.py diff --git a/dataduct/database/table.py b/dataduct/database/table.py index c95c4b5..34184ae 100644 --- a/dataduct/database/table.py +++ b/dataduct/database/table.py @@ -300,3 +300,15 @@ def upsert_script(self, source_relation, enforce_primary_key=True, script.append(self.insert_script(temp_table)) script.append(temp_table.drop_script()) return script + + def check_not_exists_script(self): + """Sql script to create statement if the table exists or not + """ + return SqlScript(""" + SELECT NOT EXISTS( + SELECT 1 + FROM information_schema.tables + WHERE table_schema = '%s' + AND table_name = '%s' + ) + """ % (self.schema_name, self.table_name)) diff --git a/dataduct/database/view.py b/dataduct/database/view.py index e90a9ac..435fe72 100644 --- a/dataduct/database/view.py +++ b/dataduct/database/view.py @@ -46,3 +46,15 @@ def drop_script(self): """Sql script to drop the view """ return SqlScript('DROP VIEW IF EXISTS %s CASCADE' % self.full_name) + + def check_not_exists_script(self): + """Sql script to create statement if the table exists or not + """ + return SqlScript(""" + SELECT NOT EXISTS( + SELECT 1 + FROM information_schema.views + WHERE table_schema = '%s' + AND table_name = '%s' + ) + """ % (self.schema_name, self.view_name)) diff --git a/dataduct/steps/create_load_redshift.py b/dataduct/steps/create_load_redshift.py index 3c6751f..21023a6 100644 --- a/dataduct/steps/create_load_redshift.py +++ b/dataduct/steps/create_load_redshift.py @@ -28,8 +28,7 @@ def __init__(self, id, table_definition, input_node=None, with open(parse_path(table_definition)) as f: table_def_string = f.read() - table_exists_script = Table( - SqlStatement(table_def_string)).exists_clone_script() + table = Table(SqlStatement(table_def_string)) if isinstance(input_node, dict): input_paths = [i.path().uri for i in input_node.values()] @@ -41,7 +40,7 @@ def __init__(self, id, table_definition, input_node=None, script_arguments = list() script_arguments.extend([ - '--table_definition=%s' % table_exists_script.sql(), + '--table_definition=%s' % table.sql().sql(), '--s3_input_paths'] + input_paths) steps_path = os.path.abspath(os.path.dirname(__file__)) diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py index 9324a1d..9c069b4 100644 --- a/dataduct/steps/create_update_sql.py +++ b/dataduct/steps/create_update_sql.py @@ -1,14 +1,16 @@ """ETL step wrapper for sql command for inserting into tables """ -from .sql_command import SqlCommandStep +import os +from .transform import TransformStep from ..database import SqlScript from ..database import Table +from ..utils import constants as const from ..utils.helpers import exactly_one from ..utils.helpers import parse_path from ..utils.exceptions import ETLInputError -class CreateUpdateSqlStep(SqlCommandStep): +class CreateUpdateSqlStep(TransformStep): """Create and Insert step that creates a table and then uses the query to update 
the table data with any sql query provided """ @@ -18,7 +20,6 @@ def __init__(self, script=None, command=None, analyze_table=True, - wrap_transaction=True, **kwargs): """Constructor for the CreateUpdateStep class @@ -36,16 +37,30 @@ def __init__(self, dest = Table(SqlScript(filename=parse_path(table_definition))) - sql_script = dest.exists_clone_script() - sql_script.append(dest.grant_script()) - sql_script.append(update_script) + steps_path = os.path.abspath(os.path.dirname(__file__)) + runner_script = os.path.join(steps_path, const.SQL_RUNNER_SCRIPT_PATH) - if wrap_transaction: - sql_script = sql_script.wrap_transaction() + script_arguments = [ + '--table_definition=%s' % dest.sql().sql(), + '--sql=%s' % update_script.sql() + ] - # Analyze cannot be done inside a transaction if analyze_table: - sql_script.append(dest.analyze_script()) + script_arguments.append('--analyze') super(CreateUpdateSqlStep, self).__init__( - sql_script=sql_script, wrap_transaction=False, **kwargs) + script=runner_script, script_arguments=script_arguments, + no_output=True, **kwargs) + + @classmethod + def arguments_processor(cls, etl, input_args): + """Parse the step arguments according to the ETL pipeline + + Args: + etl(ETLPipeline): Pipeline object containing resources and steps + step_args(dict): Dictionary of the step arguments for the class + """ + step_args = cls.base_arguments_processor(etl, input_args) + cls.pop_inputs(step_args) + step_args['resource'] = etl.ec2_resource + return step_args diff --git a/dataduct/steps/scripts/create_load_redshift_runner.py b/dataduct/steps/scripts/create_load_redshift_runner.py index c232e02..7c1ab1a 100644 --- a/dataduct/steps/scripts/create_load_redshift_runner.py +++ b/dataduct/steps/scripts/create_load_redshift_runner.py @@ -4,6 +4,7 @@ """ import argparse +import pandas.io.sql as pdsql from dataduct.config import get_aws_credentials from dataduct.data_access import redshift_connection from dataduct.database import SqlStatement @@ -72,13 +73,15 @@ def main(): args = parser.parse_args() print args - connection = redshift_connection() - cursor = connection.cursor() - table = Table(SqlStatement(args.table_definition)) + connection = redshift_connection() + table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(), + connection).loc[0][0] + cursor = connection.cursor() # Create table in redshift, this is safe due to the if exists condition - cursor.execute(table.create_script().sql()) + if table_not_exists: + cursor.execute(table.create_script().sql()) # Load data into redshift load_query = load_redshift(table, args.input_paths, args.max_error, diff --git a/dataduct/steps/scripts/sql_runner.py b/dataduct/steps/scripts/sql_runner.py new file mode 100644 index 0000000..0b1e847 --- /dev/null +++ b/dataduct/steps/scripts/sql_runner.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +"""Runner for the upsert SQL step +""" + +import argparse +import pandas.io.sql as pdsql +from dataduct.data_access import redshift_connection +from dataduct.database import SqlStatement +from dataduct.database import Table + + +def main(): + """Main Function + """ + parser = argparse.ArgumentParser() + parser.add_argument('--table_definition', dest='table_definition', + required=True) + parser.add_argument('--sql', dest='sql', required=True) + parser.add_argument('--analyze', action='store_true', default=False) + args = parser.parse_args() + print args + + table = Table(SqlStatement(args.table_definition)) + connection = redshift_connection() + table_not_exists = 
pdsql.read_sql(table.check_not_exists_script().sql(), + connection).loc[0][0] + + cursor = connection.cursor() + # Create table in redshift, this is safe due to the if exists condition + if table_not_exists: + cursor.execute(table.create_script().sql()) + + # Load data into redshift with upsert query + cursor.execute(args.sql) + cursor.execute('COMMIT') + + # Analyze the table + if args.analyze: + cursor.execute(table.analyze_script().sql()) + + cursor.close() + connection.close() + + +if __name__ == '__main__': + main() diff --git a/dataduct/steps/upsert.py b/dataduct/steps/upsert.py index 278ed8d..902bc5a 100644 --- a/dataduct/steps/upsert.py +++ b/dataduct/steps/upsert.py @@ -1,23 +1,20 @@ """ETL step wrapper for Upsert SQL script """ -from .etl_step import ETLStep -from ..pipeline import SqlActivity +from .create_update_sql import CreateUpdateSqlStep from ..database import Table from ..database import SqlScript from ..database import SelectStatement from ..database import HistoryTable -from ..s3 import S3File from ..utils.helpers import parse_path from ..utils.helpers import exactly_one -class UpsertStep(ETLStep): +class UpsertStep(CreateUpdateSqlStep): """Upsert Step class that helps run a step on the emr cluster """ - def __init__(self, destination, redshift_database, sql=None, - script=None, source=None, enforce_primary_key=True, - delete_existing=False, history=None, + def __init__(self, destination, sql=None, script=None, source=None, + enforce_primary_key=True, delete_existing=False, history=None, analyze_table=True, **kwargs): """Constructor for the UpsertStep class @@ -25,10 +22,10 @@ def __init__(self, destination, redshift_database, sql=None, **kwargs(optional): Keyword arguments directly passed to base class """ assert exactly_one(sql, source, script), 'One of sql/source/script' - super(UpsertStep, self).__init__(**kwargs) # Input formatting dest = Table(SqlScript(filename=parse_path(destination))) + if source is not None: source_relation = Table(SqlScript(filename=parse_path(source))) else: @@ -36,39 +33,14 @@ def __init__(self, destination, redshift_database, sql=None, SqlScript(sql=sql, filename=parse_path(script)).sql()) # Create the destination table if doesn't exist - script = dest.exists_clone_script() - script.append(dest.grant_script()) - script.append(dest.upsert_script( - source_relation, enforce_primary_key, delete_existing)) - - # Analyze the destination table after the load - if analyze_table: - script.append(dest.analyze_script()) + sql_script = dest.upsert_script(source_relation, enforce_primary_key, + delete_existing) if history: hist = HistoryTable(SqlScript( filename=parse_path(history))) - script.append(hist.update_history_script(dest)) - - self.activity = self.create_pipeline_object( - object_class=SqlActivity, - resource=self.resource, - schedule=self.schedule, - depends_on=self.depends_on, - database=redshift_database, - max_retries=self.max_retries, - script=self.create_script(S3File(text=script.sql()))) + sql_script.append(hist.update_history_script(dest)) - @classmethod - def arguments_processor(cls, etl, input_args): - """Parse the step arguments according to the ETL pipeline - - Args: - etl(ETLPipeline): Pipeline object containing resources and steps - step_args(dict): Dictionary of the step arguments for the class - """ - step_args = cls.base_arguments_processor(etl, input_args) - cls.pop_inputs(step_args) - step_args['resource'] = etl.ec2_resource - step_args['redshift_database'] = etl.redshift_database - return step_args + super(UpsertStep, 
self).__init__( + table_definition=destination, command=sql_script.sql(), + analyze_table=analyze_table, **kwargs) diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index 2fa0667..d0f972c 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -34,3 +34,5 @@ SCRIPTS_DIRECTORY, 'column_check_test.py') CREATE_LOAD_SCRIPT_PATH = os.path.join( SCRIPTS_DIRECTORY, 'create_load_redshift_runner.py') +SQL_RUNNER_SCRIPT_PATH = os.path.join( + SCRIPTS_DIRECTORY, 'sql_runner.py') From cc9699738dfa77997a1addbe1cc4da15e38d1fa2 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Wed, 11 Mar 2015 22:13:54 -0700 Subject: [PATCH 167/175] Add script_arguments for create_update_sql --- dataduct/steps/create_update_sql.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py index 9c069b4..7b0851c 100644 --- a/dataduct/steps/create_update_sql.py +++ b/dataduct/steps/create_update_sql.py @@ -20,6 +20,7 @@ def __init__(self, script=None, command=None, analyze_table=True, + script_arguments=None, **kwargs): """Constructor for the CreateUpdateStep class @@ -40,10 +41,13 @@ def __init__(self, steps_path = os.path.abspath(os.path.dirname(__file__)) runner_script = os.path.join(steps_path, const.SQL_RUNNER_SCRIPT_PATH) - script_arguments = [ + if script_arguments is None: + script_arguments = list() + + script_arguments.extend([ '--table_definition=%s' % dest.sql().sql(), '--sql=%s' % update_script.sql() - ] + ]) if analyze_table: script_arguments.append('--analyze') From 9d6dd54bd6d9e5736d8e5ed71079952e046ccc33 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 13 Mar 2015 13:19:37 -0700 Subject: [PATCH 168/175] script arguments --- dataduct/steps/create_update_sql.py | 21 ++++++++++++++------- dataduct/steps/scripts/sql_runner.py | 15 ++++++++++++--- 2 files changed, 26 insertions(+), 10 deletions(-) diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py index 7b0851c..ff0993f 100644 --- a/dataduct/steps/create_update_sql.py +++ b/dataduct/steps/create_update_sql.py @@ -21,6 +21,7 @@ def __init__(self, command=None, analyze_table=True, script_arguments=None, + non_transactional=False, **kwargs): """Constructor for the CreateUpdateStep class @@ -41,19 +42,25 @@ def __init__(self, steps_path = os.path.abspath(os.path.dirname(__file__)) runner_script = os.path.join(steps_path, const.SQL_RUNNER_SCRIPT_PATH) - if script_arguments is None: - script_arguments = list() - - script_arguments.extend([ + arguments = [ '--table_definition=%s' % dest.sql().sql(), '--sql=%s' % update_script.sql() - ]) + ] if analyze_table: - script_arguments.append('--analyze') + arguments.append('--analyze') + + if non_transactional: + arguments.append('--non_transactional') + + if script_arguments is not None: + if not isinstance(script_arguments, list): + raise ETLInputError( + 'Script arguments for SQL steps should be dictionary') + arguments.extend(script_arguments) super(CreateUpdateSqlStep, self).__init__( - script=runner_script, script_arguments=script_arguments, + script=runner_script, script_arguments=arguments, no_output=True, **kwargs) @classmethod diff --git a/dataduct/steps/scripts/sql_runner.py b/dataduct/steps/scripts/sql_runner.py index 0b1e847..4ee93df 100644 --- a/dataduct/steps/scripts/sql_runner.py +++ b/dataduct/steps/scripts/sql_runner.py @@ -18,11 +18,18 @@ def main(): required=True) parser.add_argument('--sql', dest='sql', required=True) 
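+    # --analyze triggers an ANALYZE on the table once the SQL has run;
+    # --non_transactional (added below) switches the connection to
+    # autocommit instead of wrapping the statements in a single transaction.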
parser.add_argument('--analyze', action='store_true', default=False) - args = parser.parse_args() - print args + parser.add_argument('--non_transactional', action='store_true', + default=False) + + args, sql_arguments = parser.parse_known_args() + print args, sql_arguments table = Table(SqlStatement(args.table_definition)) connection = redshift_connection() + # Enable autocommit for non transactional sql execution + if args.non_transactional: + connection.autocommit = True + table_not_exists = pdsql.read_sql(table.check_not_exists_script().sql(), connection).loc[0][0] @@ -32,7 +39,9 @@ def main(): cursor.execute(table.create_script().sql()) # Load data into redshift with upsert query - cursor.execute(args.sql) + sql = args.sql % tuple(sql_arguments) + print 'Running :', sql + cursor.execute(sql) cursor.execute('COMMIT') # Analyze the table From 258abcc7d9d54b83bcd3a48888964cc53100edfc Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Fri, 13 Mar 2015 13:57:45 -0700 Subject: [PATCH 169/175] QA logs --- dataduct/etl/etl_pipeline.py | 56 +++++++++++++++++++------ dataduct/qa/check.py | 8 ++-- dataduct/utils/constants.py | 1 + examples/example_primary_key_check.yaml | 2 +- 4 files changed, 50 insertions(+), 17 deletions(-) diff --git a/dataduct/etl/etl_pipeline.py b/dataduct/etl/etl_pipeline.py index cbda9b8..466d47f 100644 --- a/dataduct/etl/etl_pipeline.py +++ b/dataduct/etl/etl_pipeline.py @@ -39,7 +39,9 @@ S3_BASE_PATH = config.etl.get('S3_BASE_PATH', const.EMPTY_STR) SNS_TOPIC_ARN_FAILURE = config.etl.get('SNS_TOPIC_ARN_FAILURE', const.NONE) NAME_PREFIX = config.etl.get('NAME_PREFIX', const.EMPTY_STR) +QA_LOG_PATH = config.etl.get('QA_LOG_PATH', const.QA_STR) DP_INSTANCE_LOG_PATH = config.etl.get('DP_INSTANCE_LOG_PATH', const.NONE) +DP_PIPELINE_LOG_PATH = config.etl.get('DP_PIPELINE_LOG_PATH', const.NONE) class ETLPipeline(object): @@ -470,7 +472,21 @@ def pipeline_objects(self): return result @staticmethod - def log_s3_dp_instance_data(pipeline): + def log_uploader(uri, filename, string): + """Utility function to upload log files to S3 + """ + dp_dir = S3Path(uri=uri, is_directory=True) + dp_path = S3Path( + key=filename + '.tsv', + parent_dir=dp_dir, + ) + dp_file = S3File( + text=string, + s3_path=dp_path, + ) + dp_file.upload_to_s3() + + def log_s3_dp_instance_data(self, pipeline): """Uploads instance info for dp_instances to S3 """ dp_instance_entries = list_formatted_instance_details(pipeline) @@ -481,22 +497,32 @@ def log_s3_dp_instance_data(pipeline): writer.writerows(dp_instance_entries) # S3 Path computation - uri = os.path.join(get_s3_base_path(), - config.etl.get('DP_INSTANCE_LOG_PATH'), + uri = os.path.join(get_s3_base_path(), QA_LOG_PATH, + DP_INSTANCE_LOG_PATH, datetime.utcnow().strftime('%Y%m%d')) - dp_instances_dir = S3Path(uri=uri, is_directory=True) - dp_instances_path = S3Path( - key=pipeline.id + '.tsv', - parent_dir=dp_instances_dir, - ) - dp_instances_file = S3File( - text=output_string.getvalue(), - s3_path=dp_instances_path, - ) - dp_instances_file.upload_to_s3() + self.log_uploader(uri, pipeline.id, output_string.getvalue()) output_string.close() + def log_s3_dp_pipeline_data(self): + """Uploads instance info for dp_pipeline to S3 + """ + output_string = StringIO() + writer = csv.writer(output_string, delimiter='\t') + writer.writerow([ + self.pipeline.id, + self.name, + self.version_ts + ]) + + # S3 Path computation + uri = os.path.join(get_s3_base_path(), QA_LOG_PATH, + DP_PIPELINE_LOG_PATH, + datetime.utcnow().strftime('%Y%m%d')) + + self.log_uploader(uri, 
self.pipeline.id, output_string.getvalue()) + output_string.close() + def delete_if_exists(self): """Delete the pipelines with the same name as current pipeline """ @@ -591,5 +617,9 @@ def activate(self): ) pipeline_definition.upload_to_s3() + # Upload pipeline instance metadata to S3 + if DP_PIPELINE_LOG_PATH: + self.log_s3_dp_pipeline_data() + # Activate the pipeline with AWS self.pipeline.activate() diff --git a/dataduct/qa/check.py b/dataduct/qa/check.py index 039e8b4..04fbb84 100644 --- a/dataduct/qa/check.py +++ b/dataduct/qa/check.py @@ -9,6 +9,7 @@ from ..database import SelectStatement from ..s3 import S3Path from ..s3 import S3File +from ..utils import constants as const from ..utils.helpers import exactly_one from ..utils.helpers import get_s3_base_path @@ -144,9 +145,10 @@ def log_output_to_s3(self, destination_sql=None, table=None, string = '\t'.join(map(str, row)) # S3 Path computation - qa_test_dir_uri = os.path.join(get_s3_base_path(), - config.etl.get('QA_LOG_PATH', 'qa'), - path_suffix if path_suffix else '') + qa_test_dir_uri = os.path.join( + get_s3_base_path(), config.etl.get('QA_LOG_PATH', const.QA_STR), + config.etl.get('DP_QA_TESTS_LOG_PATH', 'dba_table_qa_tests'), + path_suffix if path_suffix else '') parent_dir = S3Path(uri=qa_test_dir_uri, is_directory=True) diff --git a/dataduct/utils/constants.py b/dataduct/utils/constants.py index d0f972c..e19bd95 100644 --- a/dataduct/utils/constants.py +++ b/dataduct/utils/constants.py @@ -19,6 +19,7 @@ LOG_STR = 'logs' DATA_STR = 'data' SRC_STR = 'src' +QA_STR = 'qa' # Step paths SCRIPTS_DIRECTORY = 'scripts' diff --git a/examples/example_primary_key_check.yaml b/examples/example_primary_key_check.yaml index 3cf3822..d3a8b14 100644 --- a/examples/example_primary_key_check.yaml +++ b/examples/example_primary_key_check.yaml @@ -9,4 +9,4 @@ steps: table_definition: tables/dev.test_table.sql log_to_s3: true script_arguments: - - --path_suffix=dba_table_qa_tests + - "--path_suffix=#{format(@scheduledStartTime, 'YYYY-MM-dd')}" From 49ce18747122c39535fd6065dcba11f27737d84e Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Tue, 17 Mar 2015 11:26:29 -0700 Subject: [PATCH 170/175] Fix sql injection issue --- dataduct/steps/scripts/sql_runner.py | 5 ++--- examples/example_create_update_sql.yaml | 5 ++++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/dataduct/steps/scripts/sql_runner.py b/dataduct/steps/scripts/sql_runner.py index 4ee93df..e7e579c 100644 --- a/dataduct/steps/scripts/sql_runner.py +++ b/dataduct/steps/scripts/sql_runner.py @@ -39,9 +39,8 @@ def main(): cursor.execute(table.create_script().sql()) # Load data into redshift with upsert query - sql = args.sql % tuple(sql_arguments) - print 'Running :', sql - cursor.execute(sql) + print cursor.mogrify(args.sql, tuple(sql_arguments)) + cursor.execute(args.sql, tuple(sql_arguments)) cursor.execute('COMMIT') # Analyze the table diff --git a/examples/example_create_update_sql.yaml b/examples/example_create_update_sql.yaml index e4ae06b..7169ecf 100644 --- a/examples/example_create_update_sql.yaml +++ b/examples/example_create_update_sql.yaml @@ -9,5 +9,8 @@ steps: command: | DELETE FROM dev.test_table WHERE id < 0; INSERT INTO dev.test_table - SELECT * FROM dev.test_table_2; + SELECT * FROM dev.test_table_2 + WHERE id < %s; table_definition: tables/dev.test_table.sql + script_arguments: + - 4 From 5963991250c682b6067d3a4e1ee9aea6576f9ee0 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Tue, 17 Mar 2015 14:23:02 -0700 Subject: [PATCH 171/175] Only convert the 
timedelta once, instead of for every pipeline --- bin/dataduct | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/bin/dataduct b/bin/dataduct index a156915..b2bf29b 100755 --- a/bin/dataduct +++ b/bin/dataduct @@ -35,13 +35,16 @@ def initialize_etl_objects(pipeline_definitions, time_delta=None, from dataduct.etl import create_pipeline from dataduct.etl import read_pipeline_definition + # Convert the time_delta if it exists + if time_delta is not None: + time_delta = timedelta(seconds=parse(time_delta)) + if backfill: + time_delta *= -1 + etls = [] for pipeline_definition in pipeline_definitions: definition = read_pipeline_definition(pipeline_definition) if time_delta is not None: - time_delta = timedelta(seconds=parse(time_delta)) - if backfill: - time_delta *= -1 definition.update({'time_delta': time_delta}) if frequency_override is not None: definition.update({'frequency': frequency_override}) From 1c9cd640c43e2dce48f152d5d16ad5337bbc058f Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Tue, 17 Mar 2015 17:01:47 -0700 Subject: [PATCH 172/175] Only send args if args actually exist --- dataduct/steps/create_update_sql.py | 2 +- dataduct/steps/scripts/sql_runner.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/dataduct/steps/create_update_sql.py b/dataduct/steps/create_update_sql.py index ff0993f..b196ea9 100644 --- a/dataduct/steps/create_update_sql.py +++ b/dataduct/steps/create_update_sql.py @@ -56,7 +56,7 @@ def __init__(self, if script_arguments is not None: if not isinstance(script_arguments, list): raise ETLInputError( - 'Script arguments for SQL steps should be dictionary') + 'Script arguments for SQL steps should be a list') arguments.extend(script_arguments) super(CreateUpdateSqlStep, self).__init__( diff --git a/dataduct/steps/scripts/sql_runner.py b/dataduct/steps/scripts/sql_runner.py index e7e579c..63a9ace 100644 --- a/dataduct/steps/scripts/sql_runner.py +++ b/dataduct/steps/scripts/sql_runner.py @@ -1,8 +1,6 @@ #!/usr/bin/env python - """Runner for the upsert SQL step """ - import argparse import pandas.io.sql as pdsql from dataduct.data_access import redshift_connection @@ -39,8 +37,14 @@ def main(): cursor.execute(table.create_script().sql()) # Load data into redshift with upsert query - print cursor.mogrify(args.sql, tuple(sql_arguments)) - cursor.execute(args.sql, tuple(sql_arguments)) + # If there are sql_arguments, place them along with the query + # Otherwise, don't include them to avoid having to use %% everytime + if len(sql_arguments) > 1: + print cursor.mogrify(args.sql, tuple(sql_arguments)) + cursor.execute(args.sql, tuple(sql_arguments)) + else: + print args.sql + cursor.execute(args.sql) cursor.execute('COMMIT') # Analyze the table From 9e4ba0b71352aa9716cb62c558bfeef6f7a43c98 Mon Sep 17 00:00:00 2001 From: Jerry Jiang Date: Tue, 17 Mar 2015 17:25:39 -0700 Subject: [PATCH 173/175] Account for len(args) == 1 --- dataduct/steps/scripts/sql_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataduct/steps/scripts/sql_runner.py b/dataduct/steps/scripts/sql_runner.py index 63a9ace..ec9f749 100644 --- a/dataduct/steps/scripts/sql_runner.py +++ b/dataduct/steps/scripts/sql_runner.py @@ -39,7 +39,7 @@ def main(): # Load data into redshift with upsert query # If there are sql_arguments, place them along with the query # Otherwise, don't include them to avoid having to use %% everytime - if len(sql_arguments) > 1: + if len(sql_arguments) >= 1: print cursor.mogrify(args.sql, 
tuple(sql_arguments)) cursor.execute(args.sql, tuple(sql_arguments)) else: From 916fd52875d8bf002f85e0afda7e6b3faa67f5e1 Mon Sep 17 00:00:00 2001 From: Sourabh Bajaj Date: Thu, 19 Mar 2015 18:04:40 -0700 Subject: [PATCH 174/175] Documentation --- dataduct/config/credentials.py | 4 +- dataduct/pipeline/utils.py | 8 +- docs/conf.py | 2 +- docs/config.rst | 288 +++++++++++++ docs/creating_an_etl.rst | 141 +----- docs/dataduct.config.rst | 61 +++ docs/dataduct.config.tests.rst | 22 + docs/dataduct.data_access.rst | 22 + docs/dataduct.database.parsers.rst | 69 +++ docs/dataduct.database.parsers.tests.rst | 46 ++ docs/dataduct.database.rst | 79 ++++ docs/dataduct.database.sql.rst | 53 +++ docs/dataduct.database.sql.tests.rst | 38 ++ docs/dataduct.database.tests.rst | 30 ++ docs/dataduct.etl.rst | 45 ++ docs/dataduct.etl.tests.rst | 30 ++ docs/dataduct.pipeline.rst | 3 + docs/dataduct.qa.rst | 54 +++ docs/dataduct.rst | 27 +- docs/dataduct.steps.rst | 80 ++++ docs/dataduct.tests.rst | 6 +- docs/dataduct.utils.rst | 24 ++ docs/etl_pipeline.rst | 7 - docs/index.rst | 33 +- docs/input_output.rst | 180 ++++++++ docs/installation.rst | 134 +++--- docs/introduction.rst | 48 +++ docs/modules.rst | 7 + docs/steps.rst | 520 +++++++++++++++++++++++ 29 files changed, 1816 insertions(+), 245 deletions(-) create mode 100644 docs/config.rst create mode 100644 docs/dataduct.config.rst create mode 100644 docs/dataduct.config.tests.rst create mode 100644 docs/dataduct.data_access.rst create mode 100644 docs/dataduct.database.parsers.rst create mode 100644 docs/dataduct.database.parsers.tests.rst create mode 100644 docs/dataduct.database.rst create mode 100644 docs/dataduct.database.sql.rst create mode 100644 docs/dataduct.database.sql.tests.rst create mode 100644 docs/dataduct.database.tests.rst create mode 100644 docs/dataduct.etl.rst create mode 100644 docs/dataduct.etl.tests.rst create mode 100644 docs/dataduct.qa.rst delete mode 100644 docs/etl_pipeline.rst create mode 100644 docs/input_output.rst create mode 100644 docs/introduction.rst create mode 100644 docs/modules.rst create mode 100644 docs/steps.rst diff --git a/dataduct/config/credentials.py b/dataduct/config/credentials.py index 3afa75a..cea9a62 100644 --- a/dataduct/config/credentials.py +++ b/dataduct/config/credentials.py @@ -10,8 +10,8 @@ def get_aws_credentials_from_iam(): """Get aws credentials using the IAM api Note: this script only runs on an EC2 instance with the appropriate resource roles. For more information, see the following: - http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/\ - AESDG-chapter-instancedata.html + http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/\ + AESDG-chapter-instancedata.html Returns: access_key(str): AWS access key diff --git a/dataduct/pipeline/utils.py b/dataduct/pipeline/utils.py index 4507a5c..1817119 100644 --- a/dataduct/pipeline/utils.py +++ b/dataduct/pipeline/utils.py @@ -47,16 +47,16 @@ def get_response_from_boto(fn, *args, **kwargs): Args: func(function): Function to call - *args(optional): arguments - **kwargs(optional): keyword arguments + args(optional): arguments + kwargs(optional): keyword arguments Returns: response(json): request response. 
Input: func(function): Function to call - *args(optional): arguments - **kwargs(optional): keyword arguments + args(optional): arguments + kwargs(optional): keyword arguments """ response = None diff --git a/docs/conf.py b/docs/conf.py index db9615f..46a24fb 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -260,7 +260,7 @@ # dir menu entry, description, category) texinfo_documents = [ ('index', 'dataduct', u'dataduct Documentation', - u'Coursera', 'dataduct', 'One line description of project.', + u'Coursera', 'dataduct', 'DataPipeline for Humans.', 'Miscellaneous'), ] diff --git a/docs/config.rst b/docs/config.rst new file mode 100644 index 0000000..e0d01d6 --- /dev/null +++ b/docs/config.rst @@ -0,0 +1,288 @@ +Config +====== + +All the dataduct setting are controlled from a single config file that +stores the credentials as well as different settings. + +The config file is read from the following places in the specified order +of priority. + +1. ``/etc/dataduct.cfg`` +2. ``~/.dataduct`` +3. ``DATADUCT_CONFIG_PATH`` environment variable + +Minimum example config: + +.. code:: YAML + + ec2: + INSTANCE_TYPE: m1.large + ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - Python 2.6 + SECURITY_GROUP: FILL_ME_IN + + emr: + MASTER_INSTANCE_TYPE: m1.large + NUM_CORE_INSTANCES: 1 + CORE_INSTANCE_TYPE: m1.large + CLUSTER_AMI: 3.1.0 + + etl: + S3_ETL_BUCKET: FILL_ME_IN + ROLE: FILL_ME_IN + RESOURCE_ROLE: FILL_ME_IN + +Config Parameters +----------------- + +Bootstrap +~~~~~~~~~ + +.. code:: YAML + + bootstrap: + ec2: + - step_type: transform + command: echo "Welcome to dataduct" + no_output: true + emr: + - step_type: transform + command: echo "Welcome to dataduct" + no_output: true + +Bootstrap steps are a chain of steps that should be executed before any +other step in the datapipeline. This can be used to copy files from S3 +or install libraries on the resource. At Coursera we use this to +download some binaries from S3 that are required for some of the +transformations. + +Note that the EMR bootstrap is only executed on the master node. If you +want to install something on the task nodes then you should use the +bootstrap parameter in the ``emr_cluster_config`` in your datapipeline. + +Custom Steps +~~~~~~~~~~~~ + +:: + + custom_steps: + - class_name: CustomExtractLocalStep + file_path: custom_extract_local.py + step_type: custom-extract-local + +Custom steps are steps that are not part of dataduct but are created to +augment the functionality provided by dataduct. At Coursera these are +often Steps that Inherit from the current object but abstract out some +of the functionality so that multiple pipelines don't have to write the +same thing twice. + +The file\_path can be an absolute path or a relative path with respect +to the ``CUSTOM_STEPS_PATH`` path defined in the ETL parameter field. +The Step classes are dynamically imported based on the config and +``step-type`` field is the one that is matched when parsing the pipeline +definition. + +Database +~~~~~~~~ + +:: + + database: + permissions: + - user: admin + permission: all + - group: consumer_group + permission: select + +Some steps such as ``upsert`` or ``create-load-redshift`` create tables +and grant them appropriate permissions so that one does not have to +create tables prior to running the ETL. The permission is the +``permission`` being granted on the table or view to the ``user`` or +``group``. If both are specified then both the grant statements are +executed. 
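+
+For the example above, a table created as, say, ``analytics.some_table``
+would receive grants roughly equivalent to the following (the table name
+here is only a placeholder; the exact statements are generated by
+dataduct)::
+
+    GRANT ALL ON analytics.some_table TO admin;
+    GRANT SELECT ON analytics.some_table TO GROUP consumer_group;
+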
+ +EC2 +~~~ + +:: + + ec2: + INSTANCE_TYPE: m1.small + ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - Python 2.6 + SECURITY_GROUP: FILL_ME_IN + +The ec2 config controls the configuration for the ec2-resource started +by the datapipeline. You can override these with ``ec2_resouce_config`` +in your pipeline definition for specific pipelines. + +EMR +~~~ + +:: + + emr: + CLUSTER_AMI: 3.1.0 + CLUSTER_TIMEOUT: 6 Hours + CORE_INSTANCE_TYPE: m1.large + NUM_CORE_INSTANCES: 1 + HADOOP_VERSION: 2.4.0 + HIVE_VERSION: null + MASTER_INSTANCE_TYPE: m3.xlarge + PIG_VERSION: null + TASK_INSTANCE_BID_PRICE: null + TASK_INSTANCE_TYPE: m1.large + +The emr config controls the configuration for the emr-resource started +by the datapipeline. + +ETL +~~~ + +:: + + etl: + CONNECTION_RETRIES: 2 + CUSTOM_STEPS_PATH: ~/dataduct/examples/steps + DAILY_LOAD_TIME: 1 + KEY_PAIR: FILL_ME_IN + MAX_RETRIES: 2 + NAME_PREFIX: dev + QA_LOG_PATH: qa + DP_INSTANCE_LOG_PATH: dp_instances + DP_PIPELINE_LOG_PATH: dp_pipelines + DP_QA_TESTS_LOG_PATH: dba_table_qa_tests + RESOURCE_BASE_PATH: ~/dataduct/examples/resources + RESOURCE_ROLE: FILL_ME_IN + RETRY_DELAY: 10 Minutes + REGION: us-east-1 + ROLE: FILL_ME_IN + S3_BASE_PATH: dev + S3_ETL_BUCKET: FILL_ME_IN + SNS_TOPIC_ARN_FAILURE: null + SNS_TOPIC_ARN_WARNING: null + FREQUENCY_OVERRIDE: one-time + DEPENDENCY_OVERRIDE: false + slack: + api_token: FILL_ME_IN + channel_name: "#dataduct" + username: FILL_ME_IN + bot_username: Dataduct Bot + TAGS: + env: + string: dev + Name: + variable: name + +This is the core parameter object which controls the ETL at the high +level. The parameters are explained below: + +- ``CONNECTION_RETRIES``: Number of retries for the database + connections. This is used to eliminate some of the transient errors + that might occur. +- ``CUSTOM_STEPS_PATH``: Path to the directory to be used for custom + steps that are specified using a relative path. +- ``DAILY_LOAD_TIME``: Default time to be used for running pipelines +- ``KEY_PAIR``: SSH key pair to be used in both the ec2 and the emr + resource. +- ``MAX_RETRIES``: Number of retries for the pipeline activities +- ``NAME_PREFIX``: Prefix all the pipeline names with this string +- ``QA_LOG_PATH``: Path prefix for all the QA steps when logging output + to S3 +- ``DP_INSTANCE_LOG_PATH``: Path prefix for DP instances to be logged + before destroying +- ``DP_PIPELINE_LOG_PATH``: Path prefix for DP pipelines to be logged +- ``DP_QA_TESTS_LOG_PATH``: Path prefix for QA tests to be logged +- ``RESOURCE_BASE_PATH``: Path to the directory used to relative + resource paths +- ``RESOURCE_ROLE``: Resource role needed for DP +- ``RETRY_DELAY``: Delay between each of activity retires +- ``REGION``: Region to run the datapipeline from +- ``ROLE``: Role needed for DP +- ``S3_BASE_PATH``: Prefix to be used for all S3 paths that are created + anywhere. This is used for splitting logs across multiple developer + or across production and dev +- ``S3_ETL_BUCKET``: S3 bucket to use for DP data, logs, source code + etc. +- ``SNS_TOPIC_ARN_FAILURE``: SNS to trigger for failed steps or + pipelines +- ``SNS_TOPIC_ARN_WARNING``: SNS to trigger for failed QA checks +- ``FREQUENCY_OVERRIDE``: Override every frequency given in a pipeline + with this unless overridden by CLI +- ``DEPENDENCY_OVERRIDE``: Will ignore the dependency step if set to + true. +- ``slack``: Configuration for posting messages on slack whenever a + pipeline is run +- ``Tags``: Tags to be added to the pipeline. 
The first key is the Tag + to be used, the second key is the type. If the type is string the + value is passed directly. If the type is variable then it looks up + the pipeline object for that variable. + +Logging +~~~~~~~ + +:: + + logging: + CONSOLE_DEBUG_LEVEL: INFO + FILE_DEBUG_LEVEL: DEBUG + LOG_DIR: ~/.dataduct + LOG_FILE: dataduct.log + +Settings for specifying where the logs should be outputted and debug +levels that should be used in the library code execution. + +MySQL +~~~~~ + +:: + + mysql: + host_alias_1: + HOST: FILL_ME_IN + PASSWORD: FILL_ME_IN + USERNAME: FILL_ME_IN + host_alias_2: + HOST: FILL_ME_IN + PASSWORD: FILL_ME_IN + USERNAME: FILL_ME_IN + +Rds (MySQL) database connections are stored in this parameter. The +pipeline definitions can refer to the host with the host\_alias. +``HOST`` refers to the full db hostname inside AWS. + +Redshift +~~~~~~~~ + +:: + + redshift: + CLUSTER_ID: FILL_ME_IN + DATABASE_NAME: FILL_ME_IN + HOST: FILL_ME_IN + PASSWORD: FILL_ME_IN + USERNAME: FILL_ME_IN + PORT: FILL_ME_IN + +Redshift database credentials that are used in all the steps that +interact with a warehouse. ``CLUSTER_ID`` is the first word of the +``HOST`` as this is used by ``RedshiftNode`` at a few places to identify +the cluster. + +Modes +~~~~~ + +:: + + production: + etl: + S3_BASE_PATH: prod + +Modes define override settings for running a pipeline. As config is a +singleton we can declare the overrides once and that should update the +config settings across all use cases. + +In the example we have a mode called ``production`` in which the +``S3_BASE_PATH`` is overridden to ``prod`` instead of whatever value was +specified in the defaults. + +At coursera one of the uses for modes is to change between the dev +redshift cluster to the production one when we deploy a new ETL. diff --git a/docs/creating_an_etl.rst b/docs/creating_an_etl.rst index 3e3d753..745ba58 100644 --- a/docs/creating_an_etl.rst +++ b/docs/creating_an_etl.rst @@ -35,19 +35,20 @@ Example: # PIPELINE STEPS steps: - step_type: extract-local - path: examples/resources/word_data.txt + path: data/word_data.txt - step_type: emr-streaming - mapper: examples/scripts/word_mapper.py - reducer: examples/scripts/word_reducer.py + mapper: scripts/word_mapper.py + reducer: scripts/word_reducer.py - step_type: transform - script: examples/scripts/s3_profiler.py + script: scripts/s3_profiler.py script_arguments: - --input=INPUT1_STAGING_DIR - --output=OUTPUT1_STAGING_DIR - -f + Header Information ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -90,135 +91,3 @@ Description The description allows the creator of the YAML file to clearly explain the purpose of the pipeline. - -Pipeline Steps -~~~~~~~~~~~~~~ - -The pipeline steps are very verbose and easy to understand, as they map -directly into Data Pipeline steps. Each step must have a type associated -with it (transform step / emr-streaming step) and should be named for -clarification purposes. The following lists every step type: - -emr-streaming -^^^^^^^^^^^^^ - -The *emr-streaming* step runs on a EMR instance configured from the -header. You can specify the bootstrap, mapper, and reducer files. - -.. code:: yaml - - - step_type: emr-streaming - mapper: examples/scripts/word_mapper.py - reducer: examples/scripts/word_reducer.py - -extract-local -^^^^^^^^^^^^^ - -The *extract-local* step will extract a local file (for example, a TSV -file) and write it to the output node. From there, the data can be -loaded into redshift or apply further transformations. - -.. 
code:: yaml - - - name: extract_local_step - step_type: extract-local - path: examples/resources/word_data.txt - -extract-rds -^^^^^^^^^^^ - -The *extract-rds* step extracts data from MySQL databases to S3. You can -also specify the SQL statement that you would like to execute. This -extraction will look for tables based on the host name and the database -name which needs to be pre-configured in ~/.dataduct - -.. code:: yaml - - - step_type: extract-rds - host_name: maestro - database: maestro - sql: | - SELECT * - FROM networks_network; - -extract-redshift -^^^^^^^^^^^^^^^^ - -The *extract-redshift* step extracts data from AWS Redshift (the host -and AWS details must be preconfigured in the ~/.dataduct file) into S3. - -.. code:: yaml - - - step_type: extract-redshift - schema: dev - table: categories - -extract-s3 -^^^^^^^^^^ - -The *extract-s3* step extracts files from a given S3 URI into the output -S3 node. - -.. code:: yaml - - - step_type: extract-s3 - file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py - -load-redshift -^^^^^^^^^^^^^ - -The *load-redshift* step loads data from the input nodes to the -specified Redshift table. Before specifying the Redshift table and -schema, the host and AWS details must be preconfigured in the -~/.dataduct file. For example, the following steps will upload a local -file into dev.test\_table - -.. code:: yaml - - - step_type: extract-local - path: examples/resources/test_table1.tsv - - - step_type: load-redshift - schema: dev - table: test_table - -sql-command -^^^^^^^^^^^ - -The *sql-command* step will execute a query in Redshift (the host and -AWS details must be preconfigured in the ~/.dataduct file). - -.. code:: yaml - - - step_type: sql-command - command: INSERT INTO dev.test_table VALUES (1, 'hello_etl'); - -transform -^^^^^^^^^ - -The *transform* step allows you to specify the input node, apply -transformations, and write to a specified output node. The -transformation can be in the form of a script or a UNIX command. - -.. code:: yaml - - # Unix Example - - step_type: transform - command: cp -r $INPUT1_STAGING_DIR/* $OUTPUT1_STAGING_DIR - input_node: - step1_a: step2_a - step1_b: step2_b - output: - - "step2_a" - - "step2_b" - - # Script Example - - step_type: transform - script: examples/scripts/s3_profiler.py - input_node: - step2_a: output1 - script_arguments: - - "-i=${INPUT1_STAGING_DIR}" - - "-o=${OUTPUT1_STAGING_DIR}" - - -f - diff --git a/docs/dataduct.config.rst b/docs/dataduct.config.rst new file mode 100644 index 0000000..ac2cf66 --- /dev/null +++ b/docs/dataduct.config.rst @@ -0,0 +1,61 @@ +dataduct.config package +======================= + +Subpackages +----------- + +.. toctree:: + + dataduct.config.tests + +Submodules +---------- + +dataduct.config.config module +----------------------------- + +.. automodule:: dataduct.config.config + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.config_actions module +------------------------------------- + +.. automodule:: dataduct.config.config_actions + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.constants module +-------------------------------- + +.. automodule:: dataduct.config.constants + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.credentials module +---------------------------------- + +.. automodule:: dataduct.config.credentials + :members: + :undoc-members: + :show-inheritance: + +dataduct.config.logger_config module +------------------------------------ + +.. 
automodule:: dataduct.config.logger_config + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.config + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.config.tests.rst b/docs/dataduct.config.tests.rst new file mode 100644 index 0000000..792d9fc --- /dev/null +++ b/docs/dataduct.config.tests.rst @@ -0,0 +1,22 @@ +dataduct.config.tests package +============================= + +Submodules +---------- + +dataduct.config.tests.test_credentials module +--------------------------------------------- + +.. automodule:: dataduct.config.tests.test_credentials + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.config.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.data_access.rst b/docs/dataduct.data_access.rst new file mode 100644 index 0000000..00c1eec --- /dev/null +++ b/docs/dataduct.data_access.rst @@ -0,0 +1,22 @@ +dataduct.data_access package +============================ + +Submodules +---------- + +dataduct.data_access.connection module +-------------------------------------- + +.. automodule:: dataduct.data_access.connection + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.data_access + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.parsers.rst b/docs/dataduct.database.parsers.rst new file mode 100644 index 0000000..3d2a44c --- /dev/null +++ b/docs/dataduct.database.parsers.rst @@ -0,0 +1,69 @@ +dataduct.database.parsers package +================================= + +Subpackages +----------- + +.. toctree:: + + dataduct.database.parsers.tests + +Submodules +---------- + +dataduct.database.parsers.create_table module +--------------------------------------------- + +.. automodule:: dataduct.database.parsers.create_table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.create_view module +-------------------------------------------- + +.. automodule:: dataduct.database.parsers.create_view + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.helpers module +---------------------------------------- + +.. automodule:: dataduct.database.parsers.helpers + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.select_query module +--------------------------------------------- + +.. automodule:: dataduct.database.parsers.select_query + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.transform module +------------------------------------------ + +.. automodule:: dataduct.database.parsers.transform + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.utils module +-------------------------------------- + +.. automodule:: dataduct.database.parsers.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. 
automodule:: dataduct.database.parsers + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.parsers.tests.rst b/docs/dataduct.database.parsers.tests.rst new file mode 100644 index 0000000..7c181ed --- /dev/null +++ b/docs/dataduct.database.parsers.tests.rst @@ -0,0 +1,46 @@ +dataduct.database.parsers.tests package +======================================= + +Submodules +---------- + +dataduct.database.parsers.tests.test_create_table module +-------------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_create_table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.tests.test_create_view module +------------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_create_view + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.tests.test_select_query module +-------------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_select_query + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.parsers.tests.test_transfrom module +----------------------------------------------------- + +.. automodule:: dataduct.database.parsers.tests.test_transfrom + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.parsers.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.rst b/docs/dataduct.database.rst new file mode 100644 index 0000000..4046783 --- /dev/null +++ b/docs/dataduct.database.rst @@ -0,0 +1,79 @@ +dataduct.database package +========================= + +Subpackages +----------- + +.. toctree:: + + dataduct.database.parsers + dataduct.database.sql + dataduct.database.tests + +Submodules +---------- + +dataduct.database.column module +------------------------------- + +.. automodule:: dataduct.database.column + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.database module +--------------------------------- + +.. automodule:: dataduct.database.database + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.history_table module +-------------------------------------- + +.. automodule:: dataduct.database.history_table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.relation module +--------------------------------- + +.. automodule:: dataduct.database.relation + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.select_statement module +----------------------------------------- + +.. automodule:: dataduct.database.select_statement + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.table module +------------------------------ + +.. automodule:: dataduct.database.table + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.view module +----------------------------- + +.. automodule:: dataduct.database.view + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.sql.rst b/docs/dataduct.database.sql.rst new file mode 100644 index 0000000..e438344 --- /dev/null +++ b/docs/dataduct.database.sql.rst @@ -0,0 +1,53 @@ +dataduct.database.sql package +============================= + +Subpackages +----------- + +.. 
toctree:: + + dataduct.database.sql.tests + +Submodules +---------- + +dataduct.database.sql.sql_script module +--------------------------------------- + +.. automodule:: dataduct.database.sql.sql_script + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.sql_statement module +------------------------------------------ + +.. automodule:: dataduct.database.sql.sql_statement + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.transaction module +---------------------------------------- + +.. automodule:: dataduct.database.sql.transaction + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.utils module +---------------------------------- + +.. automodule:: dataduct.database.sql.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.sql + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.sql.tests.rst b/docs/dataduct.database.sql.tests.rst new file mode 100644 index 0000000..59d7ced --- /dev/null +++ b/docs/dataduct.database.sql.tests.rst @@ -0,0 +1,38 @@ +dataduct.database.sql.tests package +=================================== + +Submodules +---------- + +dataduct.database.sql.tests.test_sql_script module +-------------------------------------------------- + +.. automodule:: dataduct.database.sql.tests.test_sql_script + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.tests.test_sql_statement module +----------------------------------------------------- + +.. automodule:: dataduct.database.sql.tests.test_sql_statement + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.sql.tests.test_sql_utils module +------------------------------------------------- + +.. automodule:: dataduct.database.sql.tests.test_sql_utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.sql.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.database.tests.rst b/docs/dataduct.database.tests.rst new file mode 100644 index 0000000..86cc9c8 --- /dev/null +++ b/docs/dataduct.database.tests.rst @@ -0,0 +1,30 @@ +dataduct.database.tests package +=============================== + +Submodules +---------- + +dataduct.database.tests.test_database module +-------------------------------------------- + +.. automodule:: dataduct.database.tests.test_database + :members: + :undoc-members: + :show-inheritance: + +dataduct.database.tests.test_history_table module +------------------------------------------------- + +.. automodule:: dataduct.database.tests.test_history_table + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.database.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.etl.rst b/docs/dataduct.etl.rst new file mode 100644 index 0000000..abde039 --- /dev/null +++ b/docs/dataduct.etl.rst @@ -0,0 +1,45 @@ +dataduct.etl package +==================== + +Subpackages +----------- + +.. toctree:: + + dataduct.etl.tests + +Submodules +---------- + +dataduct.etl.etl_actions module +------------------------------- + +.. automodule:: dataduct.etl.etl_actions + :members: + :undoc-members: + :show-inheritance: + +dataduct.etl.etl_pipeline module +-------------------------------- + +.. 
automodule:: dataduct.etl.etl_pipeline + :members: + :undoc-members: + :show-inheritance: + +dataduct.etl.utils module +------------------------- + +.. automodule:: dataduct.etl.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.etl + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.etl.tests.rst b/docs/dataduct.etl.tests.rst new file mode 100644 index 0000000..41a5eb0 --- /dev/null +++ b/docs/dataduct.etl.tests.rst @@ -0,0 +1,30 @@ +dataduct.etl.tests package +========================== + +Submodules +---------- + +dataduct.etl.tests.test_etl_actions module +------------------------------------------ + +.. automodule:: dataduct.etl.tests.test_etl_actions + :members: + :undoc-members: + :show-inheritance: + +dataduct.etl.tests.test_etl_pipeline module +------------------------------------------- + +.. automodule:: dataduct.etl.tests.test_etl_pipeline + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.etl.tests + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.pipeline.rst b/docs/dataduct.pipeline.rst index 8c4fbb5..b122eae 100644 --- a/docs/dataduct.pipeline.rst +++ b/docs/dataduct.pipeline.rst @@ -1,6 +1,9 @@ dataduct.pipeline package ========================= +Submodules +---------- + dataduct.pipeline.activity module --------------------------------- diff --git a/docs/dataduct.qa.rst b/docs/dataduct.qa.rst new file mode 100644 index 0000000..5cba802 --- /dev/null +++ b/docs/dataduct.qa.rst @@ -0,0 +1,54 @@ +dataduct.qa package +=================== + +Submodules +---------- + +dataduct.qa.check module +------------------------ + +.. automodule:: dataduct.qa.check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.column_check module +------------------------------- + +.. automodule:: dataduct.qa.column_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.count_check module +------------------------------ + +.. automodule:: dataduct.qa.count_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.primary_key_check module +------------------------------------ + +.. automodule:: dataduct.qa.primary_key_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.qa.utils module +------------------------ + +.. automodule:: dataduct.qa.utils + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: dataduct.qa + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/dataduct.rst b/docs/dataduct.rst index ceb30fe..6f37f8e 100644 --- a/docs/dataduct.rst +++ b/docs/dataduct.rst @@ -1,29 +1,16 @@ -Code References -================ - -.. automodule:: dataduct - :members: - :undoc-members: - :show-inheritance: - -Subpackages and Modules ------------------------ +Code documentation +================== .. toctree:: :maxdepth: 1 - etl_pipeline + dataduct.config + dataduct.data_access + dataduct.database + dataduct.etl dataduct.pipeline + dataduct.qa dataduct.s3 dataduct.steps dataduct.tests dataduct.utils - - -Definition Parser ------------------ - -.. 
automodule:: dataduct.definition_parser - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/dataduct.steps.rst b/docs/dataduct.steps.rst index b73f54d..e625b5e 100644 --- a/docs/dataduct.steps.rst +++ b/docs/dataduct.steps.rst @@ -4,6 +4,46 @@ dataduct.steps package Submodules ---------- +dataduct.steps.column_check module +---------------------------------- + +.. automodule:: dataduct.steps.column_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.count_check module +--------------------------------- + +.. automodule:: dataduct.steps.count_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.create_load_redshift module +------------------------------------------ + +.. automodule:: dataduct.steps.create_load_redshift + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.create_update_sql module +--------------------------------------- + +.. automodule:: dataduct.steps.create_update_sql + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.emr_job module +----------------------------- + +.. automodule:: dataduct.steps.emr_job + :members: + :undoc-members: + :show-inheritance: + dataduct.steps.emr_streaming module ----------------------------------- @@ -60,6 +100,38 @@ dataduct.steps.load_redshift module :undoc-members: :show-inheritance: +dataduct.steps.pipeline_dependencies module +------------------------------------------- + +.. automodule:: dataduct.steps.pipeline_dependencies + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.primary_key_check module +--------------------------------------- + +.. automodule:: dataduct.steps.primary_key_check + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.qa_transform module +---------------------------------- + +.. automodule:: dataduct.steps.qa_transform + :members: + :undoc-members: + :show-inheritance: + +dataduct.steps.reload module +---------------------------- + +.. automodule:: dataduct.steps.reload + :members: + :undoc-members: + :show-inheritance: + dataduct.steps.sql_command module --------------------------------- @@ -76,6 +148,14 @@ dataduct.steps.transform module :undoc-members: :show-inheritance: +dataduct.steps.upsert module +---------------------------- + +.. automodule:: dataduct.steps.upsert + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/dataduct.tests.rst b/docs/dataduct.tests.rst index 7a1df98..215751d 100644 --- a/docs/dataduct.tests.rst +++ b/docs/dataduct.tests.rst @@ -4,10 +4,10 @@ dataduct.tests package Submodules ---------- -dataduct.tests.test_definition_parser module --------------------------------------------- +dataduct.tests.test_import module +--------------------------------- -.. automodule:: dataduct.tests.test_definition_parser +.. automodule:: dataduct.tests.test_import :members: :undoc-members: :show-inheritance: diff --git a/docs/dataduct.utils.rst b/docs/dataduct.utils.rst index a25391f..cfe860e 100644 --- a/docs/dataduct.utils.rst +++ b/docs/dataduct.utils.rst @@ -4,6 +4,22 @@ dataduct.utils package Submodules ---------- +dataduct.utils.cli module +------------------------- + +.. automodule:: dataduct.utils.cli + :members: + :undoc-members: + :show-inheritance: + +dataduct.utils.constants module +------------------------------- + +.. 
automodule:: dataduct.utils.constants + :members: + :undoc-members: + :show-inheritance: + dataduct.utils.exceptions module -------------------------------- @@ -20,6 +36,14 @@ dataduct.utils.helpers module :undoc-members: :show-inheritance: +dataduct.utils.slack_hook module +-------------------------------- + +.. automodule:: dataduct.utils.slack_hook + :members: + :undoc-members: + :show-inheritance: + Module contents --------------- diff --git a/docs/etl_pipeline.rst b/docs/etl_pipeline.rst deleted file mode 100644 index 2f40774..0000000 --- a/docs/etl_pipeline.rst +++ /dev/null @@ -1,7 +0,0 @@ -ETLPipeline -=========== - -.. automodule:: dataduct.etl_pipeline - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/index.rst b/docs/index.rst index d70b6b6..e7d4bc4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -3,23 +3,40 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -Dataduct - DataPipeline for humans -==================================== +Dataduct +======== -Dataduct is a wrapper built on top of AWS Datapipeline which makes it easy to -create ETL jobs. All jobs can be specified as a series of steps in a YAML file -and would automatically be translated into datapipeline with appropriate -pipeline objects. + Dataduct - DataPipeline for humans + +`Dataduct `__ is a wrapper built +on top of `AWS +Datapipeline `__ +which makes it easy to create ETL jobs. All jobs can be specified as a +series of steps in a YAML file and would automatically be translated +into datapipeline with appropriate pipeline objects. + +Top features include visualizing pipeline dependencies, extract data +from different sources such as RDS, S3, Local files, run a series of +transformations using EC2 or EMR, load data into Redshift and complete +transformations inside redshift as well. QA steps for sanity checking +the data. + +It is easy to create custom steps to augment the DSL as per the +requirements. As well as running a backfill with the command line +interface. -Running an ETL is as simple as ``$ dataduct -a create pipeline.yaml`` Contents: .. toctree:: - :maxdepth: 1 + :maxdepth: 2 + introduction installation + config creating_an_etl + steps + input_output dataduct Indices and tables diff --git a/docs/input_output.rst b/docs/input_output.rst new file mode 100644 index 0000000..832ac20 --- /dev/null +++ b/docs/input_output.rst @@ -0,0 +1,180 @@ +Input and Output Nodes +======================= + +In dataduct, data is shared between two activities using S3. After a +step is finished, it saves its output to a file in S3 for successive +steps to read. Input and output nodes abstract this process, they +represent the S3 directories in which the data is stored. A step's input +node determines which S3 file it will read as input, and its output node +determines where it will store its output. In most cases, this +input-output node chain is taken care of by dataduct, but there are +situations where you may want finer control over this process. + +Input Nodes +~~~~~~~~~~~ + +The default behaviour of steps (except Extract- and Check-type steps) is +to link its input node with the preceding step's output node. 
For +example, in this pipeline snippet + +:: + + - step_type: extract-local + path: data/test_table1.tsv + + - step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + +the output of the ``extract-local`` step is fed into the +``create-load-redshift`` step, so the pipeline will load the data found +inside ``data/test_table1.tsv`` into ``dev.test_table.sql``. This +behaviour can be made explicit through the ``name`` and ``input_node`` +properties. + +:: + + # This pipeline has the same behaviour as the previous pipeline. + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table.sql + +When an input -> output node link is created, implicitly or explicitly, +dependencies are created automatically between the two steps. This +behaviour can be made explicit through the ``depends_on`` property. + +:: + + # This pipeline has the same behaviour as the previous pipeline. + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: create-load-redshift + input_node: extract_data + depends_on: extract_data + table_definition: tables/dev.test_table.sql + +You can use input nodes to communicate between steps that are not next +to each other. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: extract-local + path: data/test_table2.tsv + + # This step will use the output of the first extract-local step (test_table1.tsv) + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table.sql + +Without the use of ``input_node``, the ``create-load-redshift`` step +would have used the data from ``test_table2.tsv`` instead. + +You can also use input nodes to reuse the output of a step. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table1.sql + + - step_type: create-load-redshift + input_node: extract_data + table_definition: tables/dev.test_table2.sql + +Sometimes, you may not want a step to have any input nodes. You can +specify this by writing ``input_node: []``. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + # This step will not receive any input data + - step_type: transform + input_node: [] + script: scripts/example_script.py + +If you are running your own script (e.g. through the Transform step), +the input node's data can be found in the directory specified by +``INPUT1_STAGING_DIR``. + +:: + + - step_type: extract-local + name: extract_data + path: data/test_table1.tsv + + # manipulate_data.py takes in the input directory as a script argument + - step_type: transform + script: scripts/manipulate_data.py + script_arguments: + - --input=INPUT1_STAGING_DIR + +Output Nodes +~~~~~~~~~~~~ + +Dataduct usually handles a step's output nodes automatically, saving the +file into a default path in S3. You can set the default path through +your dataduct configuration file. However, some steps also have an +optional ``output_path`` property, allowing you to choose an S3 +directory to store the step's output. + +Transform Step and Output Nodes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Transform steps allow you to run your own scripts. 
If you want to save +the results of your script, you can store data into the output node by +writing to the directory specified by ``OUTPUT1_STAGING_DIR``. + +:: + + # generate_data.py takes in the output directory as a script argument + - step_type: transform + script: scripts/generate_data.py + script_arguments: + - --output=OUTPUT1_STAGING_DIR + + - step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + +You may wish to output more than one set of data for multiple proceeding +steps to use. You can do this through the ``output_node`` property. + +:: + + - step_type: transform + script: scripts/generate_data.py + script_arguments: + - --output=OUTPUT1_STAGING_DIR + output_node: + - foo_data + - bar_data + + - step_type: create-load-redshift + input_node: foo_data + table_definition: tables/dev.test_table1.sql + + - step_type: create-load-redshift + input_node: bar_data + table_definition: tables/dev.test_table2.sql + +In this case, the script must save data to subdirectories with names +matching the output nodes. In the above example, ``generate_data.py`` +must save data in ``OUTPUT1_STAGING_DIR/foo_data`` and +``OUTPUT1_STAGING_DIR/bar_data`` directories. If the subdirectory and +output node names are mismatched, the output nodes will not be generated +correctly. diff --git a/docs/installation.rst b/docs/installation.rst index 6d90ea5..178e02d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,79 +1,85 @@ Installation -~~~~~~~~~~~~ +============ -Install the dataduct package using pip +Installation using pip +---------------------- + +Dataduct can easily be installed using pip with the following commands. :: pip install dataduct -**Dependencies** +The major dependencies of dataduct are: + +- ``boto`` greater than version 2.34, older versions are missing some + of the functionality provided by EMR +- ``PyYAML`` +- ``pandas`` +- ``psycopg2`` +- ``pytimeparse`` +- ``MySQL-python`` +- ``pyparsing`` +- ``testfixtures`` + +The visualizations are created using: + +- ``graphviz`` +- ``pygraphviz`` + +Autocomplete for the CLI is supported using: + +- ``argcomplete`` + +The documentation is created using: + +- ``sphinx`` +- ``sphinx-napolean`` +- ``sphinx_rtd_theme`` + +Installing in the developer environment +--------------------------------------- + +1. Clone the Repo +^^^^^^^^^^^^^^^^^ + +:: + + git clone https://github.com/coursera/dataduct.git + +2. Update PATH and PYTHONPATH +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Add these lines into your ``.bash_profile`` or ``.zshrc`` etc based on +your shell type. + +:: -dataduct currently has the following dependencies: - boto >= 2.32.0 - -yaml + export PYTHONPATH=~/dataduct:$PYTHONPATH + export PATH=~/dataduct/bin:$PATH -We have tried some older versions of boto with the problem being support -some functionality around EMR that will be used in the later versions of -dataduct. +3. Config +^^^^^^^^^ -**Setup Configuration** +Create a config file. Instructions for this are provided in the config +section. + +Setup Autocomplete +------------------ + +Install argcomplete with ``pip install argcomplete``. + +If you're using ``bash`` then add the following to your +``.bash_profile``: + +:: -Setup the configuration file to set the credentials and defaul values -for various parameters passed to datapipeline. Copy the config template -from https://github.com/coursera/dataduct/../example\_config and write -it to ``~/.dataduct`` or ``/etc/.dataduct``. 
You can also set an -environment variable pointing to the config file location by setting the -``DATADUCT_PATH`` variable. + eval "$(register-python-argcomplete dataduct)" -*Config file template:* +if you're using ``zsh`` then add the following line to your ``.zshrc``: :: - # Constants that are used across the dataduct library - - ec2: - ROLE: FILL_ME_IN - RESOURCE_ROLE: FILL_ME_IN - INSTANCE_TYPE: m1.large - ETL_AMI: ami-05355a6c # Default AMI used by data pipeline - KEY_PAIR: FILL_ME_IN - SECURITY_GROUP: FILL_ME_IN - - emr: - NUM_CORE_INSTANCES: 3 - CORE_INSTANCE_TYPE: m1.large - TASK_INSTANCE_BID_PRICE: null # null if we want it to be None - TASK_INSTANCE_TYPE: m1.large - MASTER_INSTANCE_TYPE: m1.large - CLUSTER_TIMEOUT: 6 Hours - HADOOP_VERSION: null - HIVE_VERSION: null - PIG_VERSION: null - CLUSTER_AMI: 2.4.7 - - redshift: - DATABASE_NAME: FILL_ME_IN - CLUSTER_ID: FILL_ME_IN - USERNAME: FILL_ME_IN - PASSWORD: FILL_ME_IN - - mysql: - DATABASE_KEY: - HOST: FILL_ME_IN, - USERNAME: FILL_ME_IN, - PASSWORD: FILL_ME_IN - - etl: - RETRY_DELAY: 10 Minutes - MAX_RETRIES: 0 - S3_ETL_BUCKET: FILL_ME_IN - SNS_TOPIC_ARN_FAILURE: FILL_ME_IN - SNS_TOPIC_ARN_WARNING: FILL_ME_IN - DAILY_LOAD_TIME: 1 # run at 1AM UTC - - bootstrap: - - step_type: transform - input_node: [] - command: whoami >> ${OUTPUT1_STAGING_DIR}/output.txt - resource: FILL_ME_IN - name: bootstrap_transform + autoload bashcompinit + bashcompinit + eval "$(register-python-argcomplete dataduct)" diff --git a/docs/introduction.rst b/docs/introduction.rst new file mode 100644 index 0000000..de1ecaf --- /dev/null +++ b/docs/introduction.rst @@ -0,0 +1,48 @@ +Introduction +============= + +`Dataduct `__ is a wrapper built +on top of `AWS +Datapipeline `__ +which makes it easy to create ETL jobs. All jobs can be specified as a +series of steps in a YAML file and would automatically be translated +into datapipeline with appropriate pipeline objects. + +Top features include visualizing pipeline dependencies, extract data +from different sources such as RDS, S3, Local files, run a series of +transformations using EC2 or EMR, load data into Redshift and complete +transformations inside redshift as well. QA steps for sanity checking +the data. + +It is easy to create custom steps to augment the DSL as per the +requirements. As well as running a backfill with the command line +interface. + +An example ETL from RDS would look like: + +.. code:: YAML + + name: example_upsert + frequency: daily + load_time: 01:00 # Hour:Min in UTC + + steps: + - step_type: extract-rds + host_name: test_host + database: test_database + sql: | + SELECT * + FROM test_table; + + - step_type: create-load-redshift + table_definition: tables/dev.test_table.sql + + - step_type: upsert + source: tables/dev.test_table.sql + destination: tables/dev.test_table_2.sql + +This would first perform an extraction from the RDS database with the +``extract-rds`` step using the ``COPY ACTIVITY``. Then load the data +into the ``dev.test_table`` in redshift with the +``create-load-redshift``. Then perform an ``upsert`` with the data into +the ``test_table_2``. diff --git a/docs/modules.rst b/docs/modules.rst new file mode 100644 index 0000000..e7b9c81 --- /dev/null +++ b/docs/modules.rst @@ -0,0 +1,7 @@ +dataduct +======== + +.. 
toctree::
+   :maxdepth: 4
+
+   dataduct
diff --git a/docs/steps.rst b/docs/steps.rst
new file mode 100644
index 0000000..832ca1b
--- /dev/null
+++ b/docs/steps.rst
@@ -0,0 +1,520 @@
+Steps and Pipeline Objects
+==========================
+
+Pipeline objects are classes that directly translate one-to-one from the
+dataduct classes to `DP
+objects `__.
+A step is an abstraction layer that can translate into one or more
+pipeline objects based on the action type. For example, a ``sql-command``
+step translates into a ``sql-activity``, while a ``transform`` step
+translates into a ``shell command activity`` and creates an output
+``s3 node``.
+
+Definition of a Step
+--------------------
+
+A step is defined as a series of properties in YAML. For example,
+
+::
+
+    - step_type: extract-s3
+      name: get_file
+      file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py
+
+defines an ``extract-s3`` step with properties ``name`` and
+``file_uri``.
+
+Common
+------
+
+These are the properties that all steps possess.
+
+- ``step_type``: The step type. Must be either a pre-defined step or a
+  custom step. (Required)
+- ``name``: The user-defined name of the step. Will show up as part of
+  the component name in DataPipeline.
+- ``input_node``: See input and output nodes.
+- ``depends_on``: This step will not run until the step(s) specified
+  have finished.
+
+Extract S3
+----------
+
+Extracts the contents from the specified file or directory in S3. May be
+used as input to other steps.
+
+Properties
+^^^^^^^^^^
+
+One of: (Required)
+
+- ``file_uri``: The location of a single file in S3.
+- ``directory_uri``: The location of a directory in S3.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: extract-s3
+      file_uri: s3://elasticmapreduce/samples/wordcount/wordSplitter.py
+
+Extract Local
+-------------
+
+Extracts the contents from the specified file locally. May be used as
+input to other steps. May only be used with one-time pipelines.
+
+Properties
+^^^^^^^^^^
+
+- ``path``: The location of a single file. (Required)
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: extract-local
+      path: data/example_file.tsv
+
+Extract RDS
+-----------
+
+Extracts the contents of a table from an RDS instance. May be used as
+input to other steps. Data is stored in TSV format.
+
+Properties
+^^^^^^^^^^
+
+- ``host_name``: The host name to lookup in the ``mysql`` section of
+  the configuration file. (Required)
+- ``database``: The database in the RDS instance in which the table
+  resides. (Required)
+- ``output_path``: Output the extracted data to the specified S3 path.
+
+One of: (Required)
+
+- ``sql``: The SQL query to execute to extract data.
+- ``table``: The table to extract. Equivalent to a SQL query of
+  ``SELECT * FROM table``.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: extract-rds
+      host_name: maestro
+      database: maestro
+      sql: |
+        SELECT *
+        FROM example_rds_table;
+
+Extract Redshift
+-------------------------
+
+Extracts the contents of a table from a Redshift instance. May be used
+as input to other steps. Data is stored in TSV format.
+
+Properties
+^^^^^^^^^^
+
+- ``schema``: The schema of the table. (Required)
+- ``table``: The name of the table. (Required)
+- ``output_path``: Output the extracted data to the specified S3 path.
+  Optional.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: extract-redshift
+      schema: prod
+      table: example_redshift_table
+
+Transform
+-------------------------
+
+Runs a specified script on a resource.
+
+Properties
+^^^^^^^^^^
+
+- ``output_node``: See input and output nodes.
+- ``script_arguments``: Arguments passed to the script.
+- ``script_name``: Required if ``script_directory`` is specified.
+  Script to be executed in the directory.
+- ``additional_s3_files``: Additional files to include from S3.
+- ``output_path``: Save the script's output to the specified S3 path.
+- ``no_output``: If ``true``, step will produce no extractable output.
+  Default: ``false``
+
+One of: (Required)
+
+- ``command``: A command to be executed directly.
+- ``script``: Local path to the script that should be executed.
+- ``script_directory``: Local path to a directory of scripts to be
+  uploaded to the resource.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: transform
+      script: scripts/example_script.py
+      script_arguments:
+      - "--foo=bar"
+
+SQL Command
+-------------------------
+
+Executes a SQL statement in a Redshift instance.
+
+Properties
+^^^^^^^^^^
+
+- ``script_arguments``: Arguments passed to the SQL command.
+- ``queue``: Query queue that should be used.
+- ``wrap_transaction``: If ``true``, SQL command will be wrapped inside
+  a transaction. Default: ``true``
+
+One of: (Required)
+
+- ``command``: Command to be executed directly.
+- ``script``: Local path to the script that should be executed.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: sql-command
+      command: SELECT * FROM dev.test_table;
+
+EMR Streaming
+-------------------------
+
+Executes a map and an optional reduce script using Amazon Elastic
+MapReduce.
+
+Properties
+^^^^^^^^^^
+
+- ``mapper``: Local path to the mapper script (Required)
+- ``reducer``: Local path to the reducer script
+- ``hadoop_params``: List of arguments to the hadoop command
+- ``output_path``: Save the script's output to the specified S3 path
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: emr-streaming
+      mapper: scripts/word_mapper.py
+      reducer: scripts/word_reducer.py
+
+Load Redshift
+-------------------------
+
+Loads the data from its input node into a Redshift instance.
+
+Properties
+^^^^^^^^^^
+
+- ``schema``: The schema of the table. (Required)
+- ``table``: The name of the table. (Required)
+- ``insert_mode``: See Amazon's RedshiftCopyActivity documentation.
+  Default: TRUNCATE
+- ``max_errors``: The maximum number of errors to be ignored during the
+  load
+- ``replace_invalid_char``: Character to replace non-utf8 characters
+  with
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: load-redshift
+      schema: dev
+      table: example_table
+
+Pipeline Dependencies
+-------------------------
+
+Keeps running until another pipeline has finished. Use with the
+``depends_on`` property to stall the pipeline.
+
+Properties
+^^^^^^^^^^
+
+- ``dependent_pipelines``: List of pipelines to wait for. (Required)
+- ``refresh_rate``: Time, in seconds, to wait between polls. Default:
+  300
+- ``start_date``: Date on which the pipelines started. Default: the
+  current day
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: pipeline-dependencies
+      refresh_rate: 60
+      dependent_pipelines:
+      - example_transform
+
+Create Load Redshift
+-------------------------
+
+Special transform step that loads the data from its input node into a
+Redshift instance. If the table it's loading into does not exist, the
+table will be created.
+
+Properties
+^^^^^^^^^^
+
+- ``table_definition``: Schema file for the table to be loaded.
+  (Required)
+- ``script_arguments``: Arguments for the runner.
+
+  - ``--max_error``: The maximum number of errors to be ignored during
+    the load. Usage: ``--max_error=5``
+  - ``--replace_invalid_char``: Character to replace non-utf8
+    characters with. Usage: ``--replace_invalid_char='?'``
+  - ``--no_escape``: If passed, does not escape special characters.
+    Usage: ``--no_escape``
+  - ``--gzip``: If passed, compresses the output with gzip. Usage:
+    ``--gzip``
+  - ``--command_options``: A custom SQL string as the options for the
+    copy command. Usage: ``--command_options="DELIMITER '\t'"``
+
+    - Note: If ``--command_options`` is passed, script arguments
+      ``--max_error``, ``--replace_invalid_char``, ``--no_escape``,
+      and ``--gzip`` have no effect.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: create-load-redshift
+      table_definition: tables/dev.example_table.sql
+
+Upsert
+-------------------------
+
+Extracts data from a Redshift instance and upserts the data into a
+table. Upsert = Update + Insert. If a row already exists (by matching
+primary keys), the row will be updated. If the row does not already
+exist, the row will be inserted. If the table it's upserting into does
+not exist, the table will be created.
+
+Properties
+^^^^^^^^^^
+
+- ``destination``: Schema file for the table to upsert into. (Required)
+- ``enforce_primary_key``: If true, de-duplicates data by matching
+  primary keys. Default: true
+- ``history``: Schema file for the history table to record the changes
+  in the destination table.
+- ``analyze_table``: If true, runs ``ANALYZE`` on the table afterwards.
+  Default: true
+
+One of: (Required)
+
+- ``sql``: The SQL query to run to extract data.
+- ``script``: Local path to a SQL query to run.
+- ``source``: The table to extract. Equivalent to a SQL query of
+  ``SELECT * FROM source``.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: upsert
+      source: tables/dev.example_table.sql
+      destination: tables/dev.example_table_2.sql
+
+Reload
+-------------------------
+
+Extracts data from a Redshift instance and reloads a table with the
+data. If the table it's reloading does not exist, the table will be
+created.
+
+Properties
+^^^^^^^^^^
+
+- ``destination``: Schema file for the table to reload. (Required)
+- ``enforce_primary_key``: If true, de-duplicates data by matching
+  primary keys. Default: true
+- ``history``: Schema file for the history table to record the changes
+  in the destination table.
+- ``analyze_table``: If true, runs ``ANALYZE`` on the table afterwards.
+  Default: true
+
+One of: (Required)
+
+- ``sql``: The SQL query to run to extract data.
+- ``script``: Local path to a SQL query to run.
+- ``source``: The table to extract. Equivalent to a SQL query of
+  ``SELECT * FROM source``.
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: reload
+      source: tables/dev.example_table.sql
+      destination: tables/dev.example_table_2.sql
+
+Create Update SQL
+-------------------------
+
+Creates a table if it does not exist and then runs a SQL command.
+
+Properties
+^^^^^^^^^^
+
+- ``table_definition``: Schema file for the table to create. (Required)
+- ``script_arguments``: Arguments for the SQL script.
+- ``non_transactional``: If true, does not wrap the command in a
+  transaction. Default: false
+- ``analyze_table``: If true, runs ``ANALYZE`` on the table afterwards.
+  Default: true
+
+One of: (Required)
+
+- ``command``: SQL command to execute directly.
+- ``script``: Local path to a SQL command to run. 
+ +Example +^^^^^^^ + +:: + + - step_type: create-update-sql + command: | + DELETE FROM dev.test_table WHERE id < 0; + INSERT INTO dev.test_table + SELECT * FROM dev.test_table_2 + WHERE id < %s; + table_definition: tables/dev.test_table.sql + script_arguments: + - 4 + +Primary Key Check +------------------------- + +Checks for primary key violations on a specific table. + +Properties +^^^^^^^^^^ + +- ``table_definition``: Schema file for the table to check. (Required) +- ``script_arguments``: Arguments for the runner script. +- ``log_to_s3``: If true, logs the output to a file in S3. Default: + false + +Example +^^^^^^^ + +:: + + - step_type: primary-key-check + table_definition: tables/dev.test_table.sql + +Count Check +------------------------- + +Compares the number of rows in the source and destination tables/SQL +scripts. + +Properties +^^^^^^^^^^ + +- ``source_host``: The source host name to lookup in the ``mysql`` + section of the configuration file. (Required) +- ``tolerance``: Tolerance threshold, in %, for the difference in count + between source and destination. Default: 1 +- ``log_to_s3``: If true, logs the output to a file in S3. Default: + false +- ``script``: Replace the default count script. +- ``script_arguments``: Arguments for the script. + +One of: (Required) + +- ``source_sql``: SQL query to select rows to count for the source. +- ``source_count_sql``: SQL query that returns a count for the source. +- ``source_table_name``: Name of source table to count. Equivalent to a + source\_count\_sql of ``SELECT COUNT(1) from source_table_name``. + +One of: (Required) + +- ``destination_sql``: SQL query to select rows to count for the + destination. +- ``destination_table_name``: Name of the destination table to count. +- ``destination_table_definition``: Schema file for the destination + table to count. + +Example +^^^^^^^ + +:: + + - step_type: count-check + source_sql: "SELECT id, name FROM networks_network;" + source_host: maestro + destination_sql: "SELECT network_id, network_name FROM prod.networks" + tolerance: 2.0 + log_to_s3: true + +Column Check +------------------------- + +Compares a sample of rows from the source and destination tables/SQL +scripts to see if they match + +Properties +^^^^^^^^^^ + +- ``source_host``: The source host name to lookup in the ``mysql`` + section of the configuration file. (Required) +- ``source_sql``: SQL query to select rows to check for the source. + (Required) +- ``sql_tail_for_source``: Statement to append at the end of the SQL + query for the source +- ``sample_size``: Number of samples to check. Default: 100 +- ``tolerance``: Tolerance threshold, in %, for mismatched rows. + Default: 1 +- ``log_to_s3``: If true, logs the output to a file in S3. Default: + false +- ``script``: Replace the default column check script. +- ``script_arguments``: Arguments for the script. + +One of: (Required) + +- ``destination_sql``: SQL query to select rows to check for the + destination. +- ``destination_table_definition``: Schema file for the destination + table to check. 
+
+Example
+^^^^^^^
+
+::
+
+    - step_type: column-check
+      source_sql: "SELECT id, name FROM networks_network;"
+      source_host: maestro
+      destination_sql: "SELECT network_id, network_name FROM prod.networks"
+      sql_tail_for_source: "ORDER BY RAND() LIMIT LIMIT_PLACEHOLDER"
+      sample_size: 10
+      log_to_s3: true

From a2bd9f62aafcc0ef5dfabd0f2219ff07fc24bf9e Mon Sep 17 00:00:00 2001
From: Sourabh Bajaj 
Date: Thu, 19 Mar 2015 22:59:46 -0700
Subject: [PATCH 175/175] convert paragraph to list

---
 CHANGES.md            | 49 ++++++++++++++++++++++++++-----------------
 docs/index.rst        | 13 +++++++-----
 docs/introduction.rst | 14 +++++++------
 3 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 545f227..f71fc65 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,33 +1,44 @@
 # Changes in dataduct

-### 0.1.0
-- Initial version of the dataduct library released
-- Support for the following steps:
-  - emr_streaming
-  - extract-local
-  - extract-s3
-  - extract-rds
-  - extract-redshift
-  - load-redshift
-  - sql-command
-  - transform
-- Examples and documentation added for all the steps
-
 ### 0.2.0
+- Travis integration for continuous builds
+- QA steps and logging to S3
+- Visualizing pipeline
+- Dataduct CLI updated as a single entry point
+- RDS connections for scripts
+- Bootstrap step for pipelines
+- Backfill or delay activation
+- Output path and input path options
+- Script directory for transform step
+- SQL sanitization for DBA actions
+- SQL parser for select and create table statements
+- Logging across the library
 - Support for custom steps
 - Pipeline dependency step
 - Reduce verbosity of imports
 - Step parsing is isolated in steps
 - More examples for steps
-- QA step functions added
-- Visualization of pipelines
 - Sync config with S3
 - Config overides with modes
 - Rename keywords and safe config failure handling
-- MySQL and Redshift connection support
 - EMR Streaming support with hadoop 2
-- Custom EMR job step
-- Support for input_path to steps to directly create S3Nodes
-- Transform step to support directory based installs
 - Exceptions cleanup
 - Read the docs support
+- Creating tables automatically for various steps
+- History table support
+- EC2 and EMR config control from YAML
+- Slack integration
+- Support for Regions in DP
+
+### 0.1.0
+- Initial version of the dataduct library released
+- Support for the following steps:
+  - emr_streaming
+  - extract-local
+  - extract-s3
+  - extract-rds
+  - extract-redshift
+  - load-redshift
+  - sql-command
+  - transform
+- Examples and documentation added for all the steps
diff --git a/docs/index.rst b/docs/index.rst
index e7d4bc4..e615a80 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -15,11 +15,14 @@ which makes it easy to create ETL jobs. All jobs can be specified as a
 series of steps in a YAML file and would automatically be translated
 into datapipeline with appropriate pipeline objects.

-Top features include visualizing pipeline dependencies, extract data
-from different sources such as RDS, S3, Local files, run a series of
-transformations using EC2 or EMR, load data into Redshift and complete
-transformations inside redshift as well. QA steps for sanity checking
-the data. 
+Features include: + +- Visualizing pipeline activities +- Extracting data from different sources such as RDS, S3, local files +- Transforming data using EC2 and EMR +- Loading data into redshift +- Transforming data inside redshift +- QA data between the source system and warehouse It is easy to create custom steps to augment the DSL as per the requirements. As well as running a backfill with the command line diff --git a/docs/introduction.rst b/docs/introduction.rst index de1ecaf..cdab355 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -8,12 +8,14 @@ which makes it easy to create ETL jobs. All jobs can be specified as a series of steps in a YAML file and would automatically be translated into datapipeline with appropriate pipeline objects. -Top features include visualizing pipeline dependencies, extract data -from different sources such as RDS, S3, Local files, run a series of -transformations using EC2 or EMR, load data into Redshift and complete -transformations inside redshift as well. QA steps for sanity checking -the data. - +Features include: + +- Visualizing pipeline activities +- Extracting data from different sources such as RDS, S3, local files +- Transforming data using EC2 and EMR +- Loading data into redshift +- Transforming data inside redshift +- QA data between the source system and warehouse It is easy to create custom steps to augment the DSL as per the requirements. As well as running a backfill with the command line interface.