diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000000..85feb30b6a --- /dev/null +++ b/.coveragerc @@ -0,0 +1,6 @@ +[report] +omit = + */python?.?/* + */site-packages/nose/* + *__init__* + *test/* diff --git a/.gitignore b/.gitignore index e1272e2367..0d47938e13 100644 --- a/.gitignore +++ b/.gitignore @@ -1,19 +1,162 @@ -*.pyc -*~ +.nicesetup + client.cfg -build -dist -luigi.egg-info + +hadoop_test.py +minicluster.py +mrrunner.py + packages.tar + test/data -hadoop_test.py -.nicesetup -.tox + +Vagrantfile + *.pickle *.rej *.orig -.DS_Store -.idea/ + + +# Created by https://www.gitignore.io + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.cache +nosetests.xml +coverage.xml + +# Translations +*.mo +*.pot + +# Django stuff: +*.log + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + + +### Vim ### +[._]*.s[a-w][a-z] +[._]s[a-w][a-z] +*.un~ +Session.vim +.netrwhist +*~ + + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm + *.iml -Vagrantfile + +## Directory-based project format: +.idea/ +# if you remove the above rule, at least ignore the following: + +# User-specific stuff: +# .idea/workspace.xml +# .idea/tasks.xml +# .idea/dictionaries + +# Sensitive or high-churn files: +# .idea/dataSources.ids +# .idea/dataSources.xml +# .idea/sqlDataSources.xml +# .idea/dynamic.xml +# .idea/uiDesigner.xml + +# Gradle: +# .idea/gradle.xml +# .idea/libraries + +# Mongo Explorer plugin: +# .idea/mongoSettings.xml + +## File-based project format: +*.ipr +*.iws + +## Plugin-specific files: + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties + + +### Vagrant ### .vagrant/ + + +### OSX ### +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + + +# Thumbnails +._* + +# Files that might appear on external disk +.Spotlight-V100 +.Trashes + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk diff --git a/.travis.yml b/.travis.yml index 12e95c33f5..e0724e2dde 100644 --- a/.travis.yml +++ b/.travis.yml @@ -10,8 +10,11 @@ env: global: - PIP_DOWNLOAD_CACHE=$HOME/.pip-cache matrix: + - TOX_ENV=pep8 - TOX_ENV=cdh - TOX_ENV=hdp + - TOX_ENV=nonhdfs + - TOX_ENV=docs sudo: false @@ -21,6 +24,8 @@ cache: install: - pip install tox -script: tox -e $TOX_ENV +script: + - tox -e $TOX_ENV -after_failure: cat /home/travis/build/spotify/luigi/.tox/cdh/log/cdh-1.log +after_failure: + - cat /home/travis/build/spotify/luigi/.tox/cdh/log/cdh-1.log diff --git a/README.rst b/README.rst index dbe6735796..c21bcb785a 100644 --- a/README.rst +++ b/README.rst @@ -2,13 +2,24 @@ :alt: Luigi Logo :align: center - About Luigi ----------- -.. image:: https://travis-ci.org/spotify/luigi.svg?branch=master +.. 
image:: https://img.shields.io/travis/spotify/luigi/master.svg?style=flat :target: https://travis-ci.org/spotify/luigi +.. image:: https://img.shields.io/coveralls/spotify/luigi/master.svg?style=flat + :target: https://coveralls.io/r/spotify/luigi?branch=master + +.. image:: https://landscape.io/github/spotify/luigi/master/landscape.svg?style=flat + :target: https://landscape.io/github/spotify/luigi/master + +.. image:: https://img.shields.io/pypi/dm/luigi.svg?style=flat + :target: https://pypi.python.org/pypi/luigi + +.. image:: https://img.shields.io/pypi/l/luigi.svg?style=flat + :target: https://pypi.python.org/pypi/luigi + Luigi is a Python package that helps you build complex pipelines of batch jobs. It handles dependency resolution, workflow management, visualization, handling failures, command line integration, and much @@ -35,17 +46,11 @@ so that you can focus on the tasks themselves and their dependencies. You can build pretty much any task you want, but Luigi also comes with a *toolbox* of several common task templates that you use. It includes native Python support for running mapreduce jobs in Hadoop, as well as -Pig and Jar jobs. It also comes with filesystem abstractions for HDFS +Hive and Jar jobs. It also comes with filesystem abstractions for HDFS and local files that ensures all file system operations are atomic. This is important because it means your data pipeline will not crash in a state containing partial data. -Luigi was built at `Spotify `_, mainly by -`Erik Bernhardsson `_ and `Elias -Freider `_, but many other people have -contributed. - - Dependency graph example ------------------------ @@ -63,7 +68,7 @@ build up data files. Background ---------- -We use Luigi internally at `Spotify `_ to run +We use Luigi internally at `Spotify `_ to run thousands of tasks every day, organized in complex dependency graphs. Most of these tasks are Hadoop jobs. Luigi provides an infrastructure that powers all kinds of stuff including recommendations, toplists, A/B @@ -75,7 +80,7 @@ can help programmers focus on the most important bits and leave the rest Conceptually, Luigi is similar to `GNU Make `_ where you have certain tasks and these tasks in turn may have dependencies on other tasks. There are -also some similarities to `Oozie `_ +also some similarities to `Oozie `_ and `Azkaban `_. One major difference is that Luigi is not just built specifically for Hadoop, and it's easy to extend it with other kinds of tasks. @@ -100,12 +105,26 @@ if you want to run Hadoop jobs since it makes debugging easier. See Getting Started --------------- -The `Luigi package documentation `_ -contains an overview of how to work with Luigi, including an `Example workflow -`_ and an `API overview +Take a look at the `Example workflow +`_ and the `API overview `_ which explains some of the most important concepts. +Who uses Luigi? +--------------- + +Several companies have written blog posts or presentation about Luigi: + +* `Spotify : NYC Data Science `_ +* `Foursquare `_ +* `Mortar Data `_ +* `Stripe `_ +* `Asana `_ +* `Buffer `_ +* `SeatGeek `_ + +Please let us know if your company wants to be featured on this list! + Getting Help ------------ @@ -113,11 +132,19 @@ Getting Help * Subscribe to the `luigi-user `_ group and ask a question. 
- External links -------------- -* `Documentation `_ (Read the Docs) -* `Mailing List `_ (Google Groups) +* `Documentation `_, including the `Luigi package documentation `_ (Read the Docs) +* `Mailing List `_ (Google Groups) * `Releases `_ (PyPi) * `Source code `_ (Github) + +Authors +------- + +Luigi was built at `Spotify `_, mainly by +`Erik Bernhardsson `_ and `Elias +Freider `_, but many other people have +contributed. + diff --git a/bin/deps.py b/bin/deps.py index fc5445f870..939b55925b 100755 --- a/bin/deps.py +++ b/bin/deps.py @@ -37,12 +37,12 @@ # -from luigi.task import flatten import luigi.interface -from luigi.target import FileSystemTarget -from luigi.postgres import PostgresTarget from luigi.contrib.ssh import RemoteTarget +from luigi.postgres import PostgresTarget from luigi.s3 import S3Target +from luigi.target import FileSystemTarget +from luigi.task import flatten def get_task_requires(task): @@ -61,6 +61,7 @@ def dfs_paths(start_task, goal_task_name, path=None): class UpstreamArg(luigi.Task): + 'Used to provide the global parameter -- upstream' upstream = luigi.Parameter(is_global=True, default=None) @@ -93,14 +94,14 @@ def find_deps_cli(): task_name = d task_output = "n/a" if isinstance(d.output(), RemoteTarget): - task_output="[SSH] {0}:{1}".format(d.output()._fs.remote_context.host, d.output().path) + task_output = "[SSH] {0}:{1}".format(d.output()._fs.remote_context.host, d.output().path) elif isinstance(d.output(), S3Target): - task_output="[S3] {0}".format(d.output().path) - elif isinstance(d.output(),FileSystemTarget): - task_output="[FileSystem] {0}".format(d.output().path) - elif isinstance (d.output(), PostgresTarget): - task_output="[DB] {0}:{1}".format(d.output().host, d.output().table) + task_output = "[S3] {0}".format(d.output().path) + elif isinstance(d.output(), FileSystemTarget): + task_output = "[FileSystem] {0}".format(d.output().path) + elif isinstance(d.output(), PostgresTarget): + task_output = "[DB] {0}:{1}".format(d.output().host, d.output().table) else: - task_output= "to be determined" + task_output = "to be determined" print """ TASK: {0} : {1}""".format(task_name, task_output) diff --git a/bin/luigi-grep.py b/bin/luigi-grep.py index 625697af03..cbdacd8d21 100755 --- a/bin/luigi-grep.py +++ b/bin/luigi-grep.py @@ -1,11 +1,9 @@ #!/usr/bin/env python -from collections import defaultdict - import argparse import json import urllib2 - +from collections import defaultdict parser = argparse.ArgumentParser( "luigi-grep is used to search for workflows using the luigi scheduler's json api") @@ -18,6 +16,7 @@ class LuigiGrep(object): + def __init__(self, host, port): self._host = host self._port = port diff --git a/doc/README.md b/doc/README.md index ce45b4ec02..8d91d9d101 100644 --- a/doc/README.md +++ b/doc/README.md @@ -10,8 +10,7 @@ Sphinx uses ReStructuredText (RST) markup. There's a good describing the syntax. We also use the sphinx [autodoc](http://sphinx- doc.org/ext/autodoc.html) functionality to parse docstrings. For examples of cross-referencing modules/libraries/classes and for documentatingfunction/method -arguments, see docs on [the python domain](http://sphinx-doc.org/domains.html -#the-python-domain). +arguments, see docs on [the python domain](http://sphinx-doc.org/domains.html#the-python-domain). 
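+
+For instance, a docstring using these roles might look like the following
+(the task and parameter shown are purely illustrative):
+
+```python
+class MyTask(luigi.Task):
+    """
+    Example task; cross-references :class:`luigi.Task` and :mod:`luigi.parameter`.
+
+    :param date: the run date for this task.
+    """
+    date = luigi.DateParameter()
+```
+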
API Documentation ----------------- diff --git a/doc/api/luigi.contrib.rst b/doc/api/luigi.contrib.rst index af92020d31..a23500dcde 100644 --- a/doc/api/luigi.contrib.rst +++ b/doc/api/luigi.contrib.rst @@ -20,6 +20,14 @@ luigi.contrib.ftp module :undoc-members: :show-inheritance: +luigi.contrib.sqla module +------------------------- + +.. automodule:: luigi.contrib.sqla + :members: + :undoc-members: + :show-inheritance: + luigi.contrib.mysqldb module ---------------------------- @@ -52,6 +60,14 @@ luigi.contrib.spark module :undoc-members: :show-inheritance: +luigi.contrib.scalding module +----------------------------- + +.. automodule:: luigi.contrib.scalding + :members: + :undoc-members: + :show-inheritance: + luigi.contrib.sparkey module ---------------------------- diff --git a/doc/api/luigi.rst b/doc/api/luigi.rst index ca1cfde128..fbbdadb2a4 100644 --- a/doc/api/luigi.rst +++ b/doc/api/luigi.rst @@ -163,14 +163,6 @@ luigi.s3 module :undoc-members: :show-inheritance: -luigi.scalding module ---------------------- - -.. automodule:: luigi.scalding - :members: - :undoc-members: - :show-inheritance: - luigi.scheduler module ---------------------- diff --git a/doc/api/luigi.tools.rst b/doc/api/luigi.tools.rst new file mode 100644 index 0000000000..b34b10c63c --- /dev/null +++ b/doc/api/luigi.tools.rst @@ -0,0 +1,30 @@ +luigi.tools package +=================== + +Submodules +---------- + +luigi.tools.parse_task module +----------------------------- + +.. automodule:: luigi.tools.parse_task + :members: + :undoc-members: + :show-inheritance: + +luigi.tools.range module +------------------------ + +.. automodule:: luigi.tools.range + :members: + :undoc-members: + :show-inheritance: + + +Module contents +--------------- + +.. automodule:: luigi.tools + :members: + :undoc-members: + :show-inheritance: diff --git a/doc/api_overview.rst b/doc/api_overview.rst index c3daffed1a..5273cd40af 100644 --- a/doc/api_overview.rst +++ b/doc/api_overview.rst @@ -1,43 +1,44 @@ API Overview ------------ -There are two fundamental building blocks of Luigi - the *Task* class -and the *Target* class. Both are abstract classes and expect a few -methods to be implemented. In addition to those two concepts, the -*Parameter* class is an important concept that governs how a Task is -run. +There are two fundamental building blocks of Luigi - +the *Task* class and the *Target* class. +Both are abstract classes and expect a few methods to be implemented. +In addition to those two concepts, +the *Parameter* class is an important concept that governs how a Task is run. Target ~~~~~~ -Broadly speaking, the Target class corresponds to a file on a disk. Or a -file on HDFS. Or some kind of a checkpoint, like an entry in a database. +Broadly speaking, +the Target class corresponds to a file on a disk, +a file on HDFS or some kind of a checkpoint, like an entry in a database. Actually, the only method that Targets have to implement is the *exists* method which returns True if and only if the Target exists. -In practice, implementing Target subclasses is rarely needed. You can -probably get pretty far with the *LocalTarget* and *hdfs.HdfsTarget* -classes that are available out of the box. These directly map to a file -on the local drive, or a file in HDFS, respectively. In addition these -also wrap the underlying operations to make them atomic. They both -implement the *open(flag)* method which returns a stream object that -could be read (flag = 'r') from or written to (flag = 'w'). 
Both -LocalTarget and hdfs.HdfsTarget also optionally take a format parameter. -Luigi comes with Gzip support by providing *format=format.Gzip* . Adding -support for other formats is pretty simple. +In practice, implementing Target subclasses is rarely needed. +You can probably get pretty far with the *LocalTarget* and *hdfs.HdfsTarget* +classes that are available out of the box. +These directly map to a file on the local drive or a file in HDFS, respectively. +In addition these also wrap the underlying operations to make them atomic. +They both implement the *open(flag)* method which returns a stream object that +could be read (flag = 'r') from or written to (flag = 'w'). +Both LocalTarget and hdfs.HdfsTarget also optionally take a format parameter. +Luigi comes with Gzip support by providing *format=format.Gzip*. +Adding support for other formats is pretty simple. Task ~~~~ The *Task* class is a bit more conceptually interesting because this is -where computation is done. There are a few methods that can be -implemented to alter its behavior, most notably *run*, *output* and -*requires*. +where computation is done. +There are a few methods that can be implemented to alter its behavior, +most notably *run*, *output* and *requires*. The Task class corresponds to some type of job that is run, but in -general you want to allow some form of parametrization of it. For -instance, if your Task class runs a Hadoop job to create a report every -night, you probably want to make the date a parameter of the class. +general you want to allow some form of parametrization of it. +For instance, if your Task class runs a Hadoop job to create a report every night, +you probably want to make the date a parameter of the class. Parameter ^^^^^^^^^ @@ -53,12 +54,12 @@ Parameter objects on the class scope: # ... By doing this, Luigi can do take care of all the boilerplate code that -would normally be needed in the constructor. Internally, the DailyReport -object can now be constructed by running -*DailyReport(datetime.date(2012, 5, 10))* or just *DailyReport()*. Luigi -also creates a command line parser that automatically handles the -conversion from strings to Python types. This way you can invoke the job -on the command line eg. by passing *--date 2012-15-10*. +would normally be needed in the constructor. +Internally, the DailyReport object can now be constructed by running +*DailyReport(datetime.date(2012, 5, 10))* or just *DailyReport()*. +Luigi also creates a command line parser that automatically handles the +conversion from strings to Python types. +This way you can invoke the job on the command line eg. by passing *--date 2012-15-10*. The parameters are all set to their values on the Task object instance, i.e. @@ -68,13 +69,13 @@ i.e. d = DailyReport(datetime.date(2012, 5, 10)) print d.date -will return the same date that the object was constructed with. Same -goes if you invoke Luigi on the command line. +will return the same date that the object was constructed with. +Same goes if you invoke Luigi on the command line. Tasks are uniquely identified by their class name and values of their -parameters. In fact, within the same worker, two tasks of the same class -with parameters of the same values are not just equal, but the same -instance: +parameters. +In fact, within the same worker, two tasks of the same class with +parameters of the same values are not just equal, but the same instance: .. 
code:: python @@ -96,10 +97,10 @@ instance: >>> c is d True -However, if a parameter is created with *significant=False*, it is -ignored as far as the Task signature is concerned. Tasks created with -only insignificant parameters differing have the same signature, but are -not the same instance: +However, if a parameter is created with *significant=False*, +it is ignored as far as the Task signature is concerned. +Tasks created with only insignificant parameters differing have the same signature but +are not the same instance: .. code:: python @@ -122,38 +123,86 @@ not the same instance: True Python is not a typed language and you don't have to specify the types -of any of your parameters. You can simply use *luigi.Parameter* if you -don't care. In fact, the reason DateParameter et al exist is just in -order to support command line interaction and make sure to convert the -input to the corresponding type (i.e. datetime.date instead of a -string). +of any of your parameters. +You can simply use *luigi.Parameter* if you don't care. +In fact, the reason DateParameter et al exist is just in order to +support command line interaction and make sure to convert the input to +the corresponding type (i.e. datetime.date instead of a string). + +Setting parameter value for other classes +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All parameters are also exposed on a class level on the command line interface. +For instance, say you have classes TaskA and TaskB: + +.. code:: python + + class TaskA(luigi.Task): + x = luigi.Parameter() + + class TaskB(luigi.Task): + y = luigi.Parameter() + + +You can run *TaskB* on the command line: *python script.py TaskB --y 42*. +But you can also set the class value of *TaskA* by running *python script.py +TaskB --y 42 --TaskA-x 43*. +This sets the value of *TaskA.x* to 43 on a *class* level. +It is still possible to override it inside Python if you instantiate *TaskA(x=44)*. + +Parameters are resolved in the following order of decreasing priority: +1. Any value passed to the constructor, or task level value set on the command line +2. Any class level value set on the command line +3. Any configuration option (if using the *config_path* argument) +4. Any default value provided to the parameter Task.requires ^^^^^^^^^^^^^ -The *requires* method is used to specify dependencies on other Task -object, which might even be of the same class. For instance, an example -implementation could be +The *requires* method is used to specify dependencies on other Task object, +which might even be of the same class. +For instance, an example implementation could be .. code:: python def requires(self): return OtherTask(self.date), DailyReport(self.date - datetime.timedelta(1)) -In this case, the DailyReport task depends on two inputs created -earlier, one of which is the same class. requires can return other Tasks -in any way wrapped up within dicts/lists/tuples/etc. +In this case, the DailyReport task depends on two inputs created earlier, +one of which is the same class. +requires can return other Tasks in any way wrapped up within dicts/lists/tuples/etc. + +Requiring another Task +^^^^^^^^^^^^^^^^^^^^^^ + +Note that requires() can *not* return a Target object. +If you have a simple Target object that is created externally +you can wrap it in a Task class like this: + +.. code:: python + + class LogFiles(luigi.Task): + def output(self): + return luigi.hdfs.HdfsTarget('/log') + +This also makes it easier to add parameters: + +.. 
code:: python + + class LogFiles(luigi.Task): + date = luigi.DateParameter() + def output(self): + return luigi.hdfs.HdfsTarget(self.date.strftime('/log/%Y-%m-%d')) Task.output ^^^^^^^^^^^ -The *output* method returns one or more Target objects. Similarly to -requires, can return wrap them up in any way that's convenient for you. -However we recommend that any Task only return one single Target in -output. If multiple outputs are returned, atomicity will be lost unless -the Task itself can ensure that the Targets are atomically created. (If -atomicity is not of concern, then it is safe to return multiple Target -objects.) +The *output* method returns one or more Target objects. +Similarly to requires, can return wrap them up in any way that's convenient for you. +However we recommend that any Task only return one single Target in output. +If multiple outputs are returned, +atomicity will be lost unless the Task itself can ensure that the Targets are atomically created. +(If atomicity is not of concern, then it is safe to return multiple Target objects.) .. code:: python @@ -166,12 +215,13 @@ objects.) Task.run ^^^^^^^^ -The *run* method now contains the actual code that is run. Note that -Luigi breaks down everything into two stages. First it figures out all -dependencies between tasks, then it runs everything. The *input()* -method is an internal helper method that just replaces all Task objects -in requires with their corresponding output. For instance, in this -example +The *run* method now contains the actual code that is run. +When you are using *requires()* and *run()*, Luigi breaks down everything into two stages. +First it figures out all dependencies between tasks, +then it runs everything. +The *input()* method is an internal helper method that just replaces all Task objects in requires +with their corresponding output. +An example: .. code:: python @@ -193,26 +243,58 @@ example g.write('%s\n', ''.join(reversed(line.strip().split())) g.close() # needed because files are atomic + +Dynamic dependencies +^^^^^^^^^^^^^^^^^^^^ + +Sometimes you might not now exactly what other tasks to depend on until runtime. +In that case, Luigi provides a mechanism to specify dynamic dependencies. +If you yield another Task in the run() method, +the current task will be suspended and the other task will be run. +You can also return a list of tasks. + +.. code:: python + + class MyTask(luigi.Task): + def run(self): + other_target = yield OtherTask() + + # dynamic dependencies resolve into targets + f = other_target.open('r') + + +This mechanism is an alternative to *requires()* in case +you are not able to build up the full dependency graph before running the task. +It does come with some constraints: +the run() method will resume from scratch each time a new task is yielded. +In other words, you should make sure your run() method is idempotent. +(This is good practice for all Tasks in Luigi, but especially so for tasks with dynamic dependencies). + +For an example of a workflow using dynamic dependencies, see +`examples/dynamic_requirements.py `_. + + Events and callbacks ^^^^^^^^^^^^^^^^^^^^ -Luigi has a built-in event system that allows you to register callbacks -to events and trigger them from your own tasks. You can both hook into -some pre-defined events and create your own. Each event handle is tied -to a Task class, and will be triggered only from that class or a -subclass of it. This allows you to effortlessly subscribe to events only -from a specific class (e.g. for hadoop jobs). 
+Luigi has a built-in event system that +allows you to register callbacks to events and trigger them from your own tasks. +You can both hook into some pre-defined events and create your own. +Each event handle is tied to a Task class and +will be triggered only from that class or +a subclass of it. +This allows you to effortlessly subscribe to events only from a specific class (e.g. for hadoop jobs). .. code:: python - @luigi.Task.event_handler(luigi.Event.SUCCESS): + @luigi.Task.event_handler(luigi.Event.SUCCESS) def celebrate_success(task): """Will be called directly after a successful execution of `run` on any Task subclass (i.e. all luigi Tasks) """ ... - @luigi.hadoop.JobTask.event_handler(luigi.Event.FAILURE): + @luigi.hadoop.JobTask.event_handler(luigi.Event.FAILURE) def mourn_failure(task, exception): """Will be called directly after a failed execution of `run` on any JobTask subclass @@ -225,11 +307,11 @@ from a specific class (e.g. for hadoop jobs). But I just want to run a Hadoop job? ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The Hadoop code is integrated in the rest of the Luigi code because we -really believe almost all Hadoop jobs benefit from being part of some -sort of workflow. However, in theory, nothing stops you from using the -hadoop.JobTask class (and also hdfs.HdfsTarget) without using the rest -of Luigi. You can simply run it manually using +The Hadoop code is integrated in the rest of the Luigi code because +we really believe almost all Hadoop jobs benefit from being part of some sort of workflow. +However, in theory, nothing stops you from using the hadoop.JobTask class (and also hdfs.HdfsTarget) +without using the rest of Luigi. +You can simply run it manually using .. code:: python @@ -248,17 +330,18 @@ You can use the hdfs.HdfsTarget class anywhere by just instantiating it: Task priority ^^^^^^^^^^^^^ -The scheduler decides which task to run next from the set of all task -that have all their dependencies met. By default, this choice is pretty -arbitrary, which is fine for most workflows and situations. +The scheduler decides which task to run next from +the set of all task that have all their dependencies met. +By default, this choice is pretty arbitrary, +which is fine for most workflows and situations. -If you want to have some control on the order of execution -of available tasks, you can set the *priority* property of a task, +If you want to have some control on the order of execution of available tasks, +you can set the *priority* property of a task, for example as follows: .. code:: python - # A static priority value as a class contant: + # A static priority value as a class constant: class MyTask(luigi.Task): priority = 100 # ... @@ -273,24 +356,26 @@ for example as follows: return 40 # ... -Tasks with a higher priority value will be picked before tasks -with a lower priority value. -There is no predefined range of priorities, you can choose whatever -(int or float) values you want to use. The default value is 0. -Note that it is perfectly valid to choose negative priorities for -tasks that should have less priority than default. - -Warning: task execution order in Luigi is influenced by both dependencies -and priorities, but in Luigi dependencies come first. For example: -if there is a task A with priority 1000 but still with unmet dependencies -and a task B with priority 1 without any pending dependencies, +Tasks with a higher priority value will be picked before tasks with a lower priority value. 
+There is no predefined range of priorities, +you can choose whatever (int or float) values you want to use. +The default value is 0. +Note that it is perfectly valid to choose negative priorities +for tasks that should have less priority than default. + +Warning: task execution order in Luigi is influenced by both dependencies and priorities, but +in Luigi dependencies come first. +For example: +if there is a task A with priority 1000 but still with unmet dependencies and +a task B with priority 1 without any pending dependencies, task B will be picked first. Instance caching ^^^^^^^^^^^^^^^^ -In addition to the stuff mentioned above, Luigi also does some metaclass -logic so that if eg. *DailyReport(datetime.date(2012, 5, 10))* is -instantiated twice in the code, it will in fact result in the same -object. This is needed so that each Task is run only once. \ No newline at end of file +In addition to the stuff mentioned above, +Luigi also does some metaclass logic so that +if e.g. *DailyReport(datetime.date(2012, 5, 10))* is instantiated twice in the code, +it will in fact result in the same object. +This is needed so that each Task is run only once. diff --git a/doc/central_scheduler.rst b/doc/central_scheduler.rst index 0a757fe7d9..a745497f9a 100644 --- a/doc/central_scheduler.rst +++ b/doc/central_scheduler.rst @@ -1,15 +1,89 @@ Using the Central Scheduler -^^^^^^^^^^^^^^^^^^^^^^^^^^^ +--------------------------- -The central scheduler does not execute anything for you, or help you -with job parallelization. The two purposes it serves are to +While the ``--local-scheduler`` flag is useful for development purposes, +it's not recommended for production usage. +The centralized scheduler services two purposes: -- Make sure two instances of the same task are not running - simultaneously +- Make sure two instances of the same task are not running simultaneously - Provide visualization of everything that's going on. -For running tasks periodically, the easiest thing to do is to trigger a -Python script from cron or from a continuously running process. There is -no central process that automatically triggers job. This model may seem -limited, but we believe that it makes things far more intuitive and easy -to understand. \ No newline at end of file +Note that the central scheduler does not execute anything for you or +help you with job parallelization. +For running tasks periodically, +the easiest thing to do is to trigger a Python script from cron or +from a continuously running process. +There is no central process that automatically triggers job. +This model may seem limited, but +we believe that it makes things far more intuitive and easy to understand. + +The luigid server +~~~~~~~~~~~~~~~~~ + +To run the server as a daemon run: + +:: + + PYTHONPATH=. python bin/luigid --background --pidfile --logdir --state-path + +Note that this requires ``python-daemon``. +By default, the server starts on port ``8082`` +(which can be changed with the ``--port`` flag) and listens on all IPs. + +For a full list of configuration options and defaults, +see the :ref:`scheduler configuration section `. +Note that ``luigid`` uses the same configuration files as the luigi client +(i.e. ``client.cfg`` or ``/etc/luigi/client.cfg`` by default). + +Enabling Task History +~~~~~~~~~~~~~~~~~~~~~ + +Task History is an experimental feature in which +additional information about tasks that have been executed are recorded in a relational database +for historical analysis. 
+This information is exposed via the Central Scheduler at ``/history``. + +To enable the task history, +specify ``record_task_history = True`` in the +``[scheduler]`` section of ``client.cfg`` and +specify ``db_connection`` under ``[task_history]``. +The ``db_connection`` string is to used to configure the `SQLAlchemy engine +`_. +When starting up, +``luigid`` will create all the necessary tables using `create_all +`_. + +Example configuration:: + + [scheduler] + record_task_history = True + state-path = /usr/local/var/luigi-state.pickle + + [task_history] + db_connection = sqlite:////usr/local/var/luigi-task-hist.db + +The task history has the following pages: + +* ``/history`` + a reverse-cronological listing of runs from the past 24 hours. + Example screenshot: + + .. figure:: history.png + :alt: Recent history screenshot +* ``/history/by_id/:id`` + detailed information about a run, including: + parameter values, the host on which it ran, and timing information. + Example screenshot: + + .. figure:: history_by_id.png + :alt: By id screenshot +* ``/history/by_name/:name`` + a listing of all runs of a task with the given task name. + Example screenshot: + + .. figure:: history_by_name.png + :alt: By name screenshot +* ``/history/by_params/:name?data=params`` + a listing of all runs of a given task restricted to runs with param values matching the given data. + The data is a json blob describing the parameters, + e.g. ``{"foo": "bar"}`` looks for a task with ``foo=bar``. diff --git a/doc/command_line.rst b/doc/command_line.rst index 3e1ea64973..a640f8a739 100644 --- a/doc/command_line.rst +++ b/doc/command_line.rst @@ -21,17 +21,18 @@ Any task can be instantiated and run from the command line: You can run this task from the command line like this:: - $ python my_task.py MyTask --x 123 --y 456 + $ python my_task.py MyTask --local-scheduler --x 123 --y 456 -You can also pass ``main_task_cls=MyTask`` to ``luigi.run()`` and that way +You can also pass ``main_task_cls=MyTask`` and ``local_scheduler=True`` to ``luigi.run()`` and that way you can invoke it simply using :: $ python my_task.py --x 123 --y 456 -The other way to run a Luigi task is to use the builtin *luigi* task. This will -be default on your path and can be run by providing a module name. The module -will imported dynamically:: +The other way to run a Luigi task is to use the builtin *luigi* task. +This will be default on your path and +can be run by providing a module name. +The module will imported dynamically:: $ luigi --module my_module MyTask --x 123 --y 456 diff --git a/doc/conf.py b/doc/conf.py index 1f30c58a08..9100ab455e 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -74,7 +74,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. -exclude_patterns = ['_build'] +exclude_patterns = ['_build', 'README.rst'] # The reST default role (used for this markup: `text`) to use for all # documents. @@ -149,7 +149,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +#html_static_path = ['_static'] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. 
These files are copied diff --git a/doc/configuration.rst b/doc/configuration.rst index 2e7e473a35..37999ac2d9 100644 --- a/doc/configuration.rst +++ b/doc/configuration.rst @@ -17,7 +17,32 @@ each controlling a different part of the config. Example default-scheduler-host: luigi-host.mycompany.foo error-email: foo@bar.baz -Below, we describe each section and the parameters available within it. +By default, all parameters will be overridden by matching values in the +configuration file. For instance if you have a Task definition: + +.. code:: python + + class DailyReport(luigi.hadoop.JobTask): + date = luigi.DateParameter(default=datetime.date.today()) + # ... + +Then you can override the default value for date by providing it in the +configuration: + +:: + + [DailyReport] + date: 2012-01-01 + +You can also use ``config_path`` as an argument to the ``Parameter`` if +you want to use a specific section in the config. + + +Configurable options +==================== + +Luigi comes with a lot of configurable options. Below, we describe each +section and the parameters available within it. [core] @@ -71,6 +96,14 @@ max-reschedules reschedule a job if it is found to not be done when attempting to run a dependent job. This defaults to 1. +max-shown-tasks + .. versionadded:: 1.0.20 + + The maximum number of tasks returned in a task_list api call. This + will restrict the number of tasks shown in any section in the + visualiser. Small values can alleviate frozen browsers when there are + too many done tasks. This defaults to 100000 (one hundred thousand). + no_configure_logging If true, logging is not configured. Defaults to false. @@ -120,12 +153,22 @@ worker-count-uniques worker-keep-alive If true, workers will stay alive when they run out of jobs to run, as long as they have some pending job waiting to be run. Defaults to - true. + false. worker-ping-interval Number of seconds to wait between pinging scheduler to let it know that the worker is still alive. Defaults to 1.0. +worker-timeout + .. versionadded:: 1.0.20 + + Number of seconds after which to kill a task which has been running + for too long. This provides a default value for all tasks, which can + be overridden by setting the worker-timeout property in any task. This + only works when using multiple workers, as the timeout is implemented + by killing worker subprocesses. Default value is 0, meaning no + timeout. + worker-wait-interval Number of seconds for the worker to wait before asking the scheduler for another job after the scheduler has said that it does not have any @@ -202,6 +245,9 @@ client client_version Optionally specifies hadoop client version for snakebite. +effective_user + Optionally specifies the effective user for snakebite. + namenode_host The hostname of the namenode. Needed for snakebite if snakebite_autoconfig is not set. @@ -318,6 +364,8 @@ scalding-libjars SCALDING_HOME/libjars or /usr/share/scalding/libjars +.. _scheduler-config: + [scheduler] ----------- diff --git a/doc/contributing.rst b/doc/contributing.rst index 84fec1a2a2..a8650519da 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -7,14 +7,21 @@ where x = luigi and y = spotify. Running Unit Tests ~~~~~~~~~~~~~~~~~~ -1. Install required packages: ``pip install -r test/requirements.txt`` -2. 
From the top directory, run - `Nose `__: ``nosetests`` - - - To run all tests within individual files: - ``nosetests test/parameter_test.py test/fib_test.py ...`` - - To run named tests within individual files: - ``nosetests -m '(testDate.*|testInt)' test/parameter_test.py ...`` +You can see in ``.travis.yml`` how Travis CI runs the tests. Essentially, what +you do is first ``pip install tox``, then you can run any of these examples and +change them to your needs. + + +.. code-block:: bash + + # Run all nonhdfs tests + export TOX_ENV=nonhdfs; export PYTHONPATH=''; tox -e $TOX_ENV test + + # Run specific nonhdfs tests + export TOX_ENV=nonhdfs; export PYTHONPATH=''; tox -e $TOX_ENV test/test_ssh.py + + # Run specific hdp tests with hdp hadoop distrubtion + export TOX_ENV=hdp; export PYTHONPATH=''; JAVA_HOME=/usr/lib/jvm/java-1.7.0-openjdk-amd64 tox -e $TOX_ENV test/snakebite_test.py Future Ideas ~~~~~~~~~~~~ diff --git a/doc/example_top_artists.rst b/doc/example_top_artists.rst index 65420df537..446ba39058 100644 --- a/doc/example_top_artists.rst +++ b/doc/example_top_artists.rst @@ -1,14 +1,15 @@ Example Workflow – Top Artists ------------------------------ -This is a very simplified case of something we do at Spotify a lot. All -user actions are logged to HDFS where we run a bunch of Hadoop jobs to -transform the data. At some point we might end up with a smaller data -set that we can bulk ingest into Cassandra, Postgres, or some other -format. +This is a very simplified case of something we do at Spotify a lot. +All user actions are logged to HDFS where +we run a bunch of Hadoop jobs to transform the data. +At some point we might end up with +a smaller data set that we can bulk ingest into Cassandra, Postgres, or +some other format. -For the purpose of this exercise, we want to aggregate all streams, and -find the top 10 artists. We will then put it into Postgres. +For the purpose of this exercise, we want to aggregate all streams, +find the top 10 artists and then put the results into Postgres. This example is also available in ``examples/top_artists.py`` @@ -40,10 +41,10 @@ Step 1 - Aggregate Artist Streams print >> out_file, artist, count Note that this is just a portion of the file *examples/top\_artists.py*. -In particular, ``Streams`` is defined as a ``luigi.Task``, acting as a -dependency for ``AggregateArtists``. In addition, ``luigi.run()`` is -called if the script is executed directly, allowing it to be run from -the command line. +In particular, ``Streams`` is defined as a ``luigi.Task``, +acting as a dependency for ``AggregateArtists``. +In addition, ``luigi.run()`` is called if the script is executed directly, +allowing it to be run from the command line. There are several pieces of this snippet that deserve more explanation. @@ -96,10 +97,12 @@ overview of the options: AggregateArtists.date_interval Running the command again will do nothing because the output file is -already created. In that sense, any task in Luigi is *idempotent* +already created. +In that sense, any task in Luigi is *idempotent* because running it many times gives the same outcome as running it once. Note that unlike Makefile, the output will not be recreated when any of -the input files is modified. You need to delete the output file +the input files is modified. +You need to delete the output file manually. 
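+
+For example, to force a re-run you can remove the task's output first
+(the path and invocation below are only an illustration; use whatever your
+task's ``output()`` actually points to):
+
+::
+
+    $ rm data/artist_streams_2012-06.tsv
+    $ python examples/top_artists.py AggregateArtists --local-scheduler --date-interval 2012-06
+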
The *--local-scheduler* flag tells Luigi not to connect to a scheduler @@ -137,18 +140,19 @@ Note that ``luigi.hadoop.JobTask`` doesn't require you to implement a Step 2 – Find the Top Artists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -At this point, we've counted the number of streams for each artists, for -the full time period. We are left with a large file that contains -mappings of artist -> count data, and we want to find the top 10 -artists. Since we only have a few hundred thousand artists, and -calculating artists is nontrivial to parallelize, we choose to do this -not as a Hadoop job, but just as a plain old for-loop in Python. +At this point, we've counted the number of streams for each artists, +for the full time period. +We are left with a large file that contains +mappings of artist -> count data, and we want to find the top 10 artists. +Since we only have a few hundred thousand artists, and +calculating artists is nontrivial to parallelize, +we choose to do this not as a Hadoop job, but just as a plain old for-loop in Python. .. code:: python class Top10Artists(luigi.Task): date_interval = luigi.DateIntervalParameter() - use_hadoop = luigi.BooleanParameter() + use_hadoop = luigi.BoolParameter() def requires(self): if self.use_hadoop: @@ -172,9 +176,9 @@ not as a Hadoop job, but just as a plain old for-loop in Python. yield int(streams), int(artist) The most interesting thing here is that this task (*Top10Artists*) -defines a dependency on the previous task (*AggregateArtists*). This -means that if the output of *AggregateArtists* does not exist, the task -will run before *Top10Artists*. +defines a dependency on the previous task (*AggregateArtists*). +This means that if the output of *AggregateArtists* does not exist, +the task will run before *Top10Artists*. :: @@ -186,15 +190,15 @@ Step 3 - Insert into Postgres ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ This mainly serves as an example of a specific subclass *Task* that -doesn't require any code to be written. It's also an example of how you -can define task templates that you can reuse for a lot of different -tasks. +doesn't require any code to be written. +It's also an example of how you can define task templates that +you can reuse for a lot of different tasks. .. code:: python class ArtistToplistToDatabase(luigi.postgres.CopyToTable): date_interval = luigi.DateIntervalParameter() - use_hadoop = luigi.BooleanParameter() + use_hadoop = luigi.BoolParameter() host = "localhost" database = "toplists" @@ -217,17 +221,18 @@ building all its upstream dependencies. Using the Central Planner ~~~~~~~~~~~~~~~~~~~~~~~~~ -The *--local-scheduler* flag tells Luigi not to connect to a central -scheduler. This is recommended in order to get started and or for -development purposes. At the point where you start putting things in -production we strongly recommend running the central scheduler server. -In addition to providing locking so the same task is not run by multiple -processes at the same time, this server also provides a pretty nice -visualization of your current work flow. +The *--local-scheduler* flag tells Luigi not to connect to a central scheduler. +This is recommended in order to get started and or for development purposes. +At the point where you start putting things in production +we strongly recommend running the central scheduler server. +In addition to providing locking +so that the same task is not run by multiple processes at the same time, +this server also provides a pretty nice visualization of your current work flow. 
-If you drop the *--local-scheduler* flag, your script will try to -connect to the central planner, by default at localhost port 8082. If -you run +If you drop the *--local-scheduler* flag, +your script will try to connect to the central planner, +by default at localhost port 8082. +If you run :: @@ -240,31 +245,26 @@ in the background and then run $ python wordcount.py --date 2012-W03 then in fact your script will now do the scheduling through a -centralized server. You need `Tornado `__ -for this to work. +centralized server. +You need `Tornado `__ for this to work. Launching *http://localhost:8082* should show something like this: .. figure:: web_server.png :alt: Web server screenshot - Web server screenshot -Looking at the dependency graph for any of the tasks yields something -like this: +Web server screenshot +Looking at the dependency graph +for any of the tasks yields something like this: .. figure:: aggregate_artists.png :alt: Aggregate artists screenshot - Aggregate artists screenshot -In case your job crashes remotely due to any Python exception, Luigi -will try to fetch the traceback and print it on standard output. You -need `Mechanize `__ for it +Aggregate artists screenshot +In case your job crashes remotely due to any Python exception, +Luigi will try to fetch the traceback and print it on standard output. +You need `Mechanize `__ for it to work and you also need connectivity to your tasktrackers. -To run the server as a daemon run: - -:: - - PYTHONPATH=. python bin/luigid --background --pidfile --logdir --state-path - -Note that this requires python-daemon for this to work. +In production, you'll want to run the centralized scheduler. +See: :doc:`central_scheduler` for more information. diff --git a/doc/execution_model.rst b/doc/execution_model.rst index 542aec3093..11ff9147f9 100644 --- a/doc/execution_model.rst +++ b/doc/execution_model.rst @@ -1,34 +1,38 @@ Execution Model --------------- -Luigi has a quite simple model. The most important aspect is that *no -execution is transferred*. When you run a Luigi workflow, the worker -schedules all tasks, and also executes the tasks within the process. +Luigi has a quite simple model. +The most important aspect is that *no execution is transferred*. +When you run a Luigi workflow, +the worker schedules all tasks, and +also executes the tasks within the process. -The benefit of this scheme is that it's super easy to debug since all -execution takes place in the process. It also makes deployment a -non-event. During development, you typically run the Luigi workflow from -the command line, whereas when you deploy it, you can trigger it using -crontab or any other scheduler. +The benefit of this scheme is that +it's super easy to debug since all execution takes place in the process. +It also makes deployment a non-event. +During development, +you typically run the Luigi workflow from the command line, +whereas when you deploy it, +you can trigger it using crontab or any other scheduler. -The downside is that Luigi doesn't give you scalability for free, but we -think that should really be up to each Task to implement rather than -relying on Luigi as a scalability engine. Another downside is that you -have to rely on an external scheduler such as crontab to actually -trigger the workflows. +The downside is that Luigi doesn't give you scalability for free, but +we think that should really be up to each Task to implement rather than +relying on Luigi as a scalability engine. 
+Another downside is that you have to rely on an external scheduler +such as crontab to actually trigger the workflows. -Isn't the point of Luigi to automate and schedule these workflows? Not -necessarily. Luigi helps you *encode the dependencies* of tasks and -build up chains. Furthermore, Luigi's scheduler makes sure that there's -centralized view of the dependency graph and that the same job will not -be executed by multiple workers simultaneously. +Isn't the point of Luigi to automate and schedule these workflows? +Not necessarily. +Luigi helps you *encode the dependencies* of tasks and +build up chains. +Furthermore, Luigi's scheduler makes sure that there's centralized view of the dependency graph and +that the same job will not be executed by multiple workers simultaneously. -This means that scheduling a complex workflow is fairly trivial using -eg. crontab. If you have an external data dump that arrives every day -and that your workflow depends on it, you write a workflow that depends -on this data dump. Crontab can then trigger this workflow *every minute* -to check if the data has arrived. If it has, it will run the full -dependency graph. +This means that scheduling a complex workflow is fairly trivial using eg. crontab. +If you have an external data dump that arrives every day and that your workflow depends on it, +you write a workflow that depends on this data dump. +Crontab can then trigger this workflow *every minute* to check if the data has arrived. +If it has, it will run the full dependency graph. .. code:: python @@ -53,13 +57,14 @@ dependency graph. if __name__ == '__main__': luigi.run(main_task_cls=RunAll) -You can trigger this as much as you want from crontab, and even across -multiple machines, because the central scheduler will make sure at most -one of each ``AggregationTask`` task is run simultaneously. Note that -this might actually mean multiple tasks can be run because there are -instances with different parameters, and this can gives you some form of -parallelization (eg. ``AggregationTask(2013-01-09)`` might run in -parallel with ``AggregationTask(2013-01-08)``). +You can trigger this as much as you want from crontab, and +even across multiple machines, because +the central scheduler will make sure at most one of each ``AggregationTask`` task is run simultaneously. +Note that this might actually mean multiple tasks can be run because +there are instances with different parameters, and +this can gives you some form of parallelization +(eg. ``AggregationTask(2013-01-09)`` might run in parallel with ``AggregationTask(2013-01-08)``). -Of course, some Task types (eg. ``HadoopJobTask``) can transfer -execution to other places, but this is up to each Task to define. +Of course, +some Task types (eg. ``HadoopJobTask``) can transfer execution to other places, but +this is up to each Task to define. 
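+
+A minimal crontab sketch of this pattern, assuming the example above is saved
+as ``run_all.py`` and a central ``luigid`` is running (otherwise add
+``--local-scheduler``); the paths are placeholders:
+
+::
+
+    # try the workflow every minute; idempotent tasks and the central
+    # scheduler's locking make the frequent retries harmless
+    * * * * * cd /path/to/workflows && python run_all.py >> /var/log/luigi/run_all.log 2>&1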
diff --git a/doc/history.png b/doc/history.png new file mode 100644 index 0000000000..e8173fcf85 Binary files /dev/null and b/doc/history.png differ diff --git a/doc/history_by_id.png b/doc/history_by_id.png new file mode 100644 index 0000000000..97a90f1cc5 Binary files /dev/null and b/doc/history_by_id.png differ diff --git a/doc/history_by_name.png b/doc/history_by_name.png new file mode 100644 index 0000000000..5bef1291a4 Binary files /dev/null and b/doc/history_by_name.png differ diff --git a/doc/index.rst b/doc/index.rst index a540eb5d31..aa4aa95ecc 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -3,7 +3,7 @@ You can adapt this file completely to your liking, but it should at least contain the root `toctree` directive. -.. include:: ../README.rst +.. include:: README.rst Table of Contents ----------------- @@ -30,6 +30,7 @@ API Reference luigi luigi.contrib + luigi.tools Indices and tables diff --git a/doc/luigi_patterns.rst b/doc/luigi_patterns.rst index d547e753bc..04ec0770fd 100644 --- a/doc/luigi_patterns.rst +++ b/doc/luigi_patterns.rst @@ -4,27 +4,26 @@ Luigi Patterns Code Reuse ~~~~~~~~~~ -One nice thing about Luigi is that it's super easy to depend on tasks -defined in other repos. It's also trivial to have "forks" in the -execution path, where the output of one task may become the input of -many other tasks. - -Currently no semantics for "intermediate" output is supported, meaning -that all output will be persisted indefinitely. The upside of that is -that if you try to run X -> Y, and Y crashes, you can resume with the -previously built X. The downside is that you will have a lot of -intermediate results on your file system. A useful pattern is to put -these files in a special directory and have some kind of periodical -garbage collection clean it up. +One nice thing about Luigi is that it's super easy to depend on tasks defined in other repos. +It's also trivial to have "forks" in the execution path, +where the output of one task may become the input of many other tasks. + +Currently no semantics for "intermediate" output is supported, +meaning that all output will be persisted indefinitely. +The upside of that is that if you try to run X -> Y, and Y crashes, +you can resume with the previously built X. +The downside is that you will have a lot of intermediate results on your file system. +A useful pattern is to put these files in a special directory and +have some kind of periodical garbage collection clean it up. Triggering Many Tasks ~~~~~~~~~~~~~~~~~~~~~ -A common use case is to make sure some daily Hadoop job (or something -else) is run every night. Sometimes for various reasons things will -crash for more than a day though. A useful pattern is to have a dummy -Task at the end just declaring dependencies on the past few days of -tasks you want to run. +A common use case is to make sure some daily Hadoop job +(or something else) is run every night. +Sometimes for various reasons things will crash for more than a day though. +A useful pattern is to have a dummy Task at the end +just declaring dependencies on the past few days of tasks you want to run. .. code:: python @@ -36,5 +35,5 @@ tasks you want to run. date = self.date - datetime.timedelta(i + 1) yield SomeReport(date), SomeOtherReport(date), CropReport(date), TPSReport(date), FooBarBazReport(date) -This simple task will not do anything itself, but will invoke a bunch of -other tasks. \ No newline at end of file +This simple task will not do anything itself, but +will invoke a bunch of other tasks. 
\ No newline at end of file diff --git a/doc/more_info.rst b/doc/more_info.rst index 84341776ca..fad28a063a 100644 --- a/doc/more_info.rst +++ b/doc/more_info.rst @@ -2,49 +2,41 @@ More Info --------- -Luigi is the successor to a couple of attempts that we weren't fully -happy with. We learned a lot from our mistakes and some design decisions -include: +Luigi is the successor to a couple of attempts that we weren't fully happy with. +We learned a lot from our mistakes and some design decisions include: - Straightforward command line integration. - As little boilerplate as possible. -- Focus on job scheduling and dependency resolution, not a particular - platform. In particular this means no limitation to Hadoop. Though - Hadoop/HDFS support is built-in and is easy to use, this is just one - of many types of things you can run. -- A file system abstraction where code doesn't have to care about where - files are located. -- Atomic file system operations through this abstraction. If a task - crashes it won't lead to a broken state. -- The dependencies are decentralized. No big config file in XML. Each - task just specifies which inputs it needs and cross-module - dependencies are trivial. -- A web server that renders the dependency graph and does locking etc - for free. +- Focus on job scheduling and dependency resolution, not a particular platform. + In particular this means no limitation to Hadoop. + Though Hadoop/HDFS support is built-in and is easy to use, + this is just one of many types of things you can run. +- A file system abstraction where code doesn't have to care about where files are located. +- Atomic file system operations through this abstraction. + If a task crashes it won't lead to a broken state. +- The dependencies are decentralized. + No big config file in XML. + Each task just specifies which inputs it needs and cross-module dependencies are trivial. +- A web server that renders the dependency graph and does locking etc for free. - Trivial to extend with new file systems, file formats and job types. - You can easily write jobs that inserts a Tokyo Cabinet into - Cassandra. Adding broad support S3, MySQL or Hive should be a stroll - in the park. (Feel free to send us a patch when you're done!) + You can easily write jobs that inserts a Tokyo Cabinet into Cassandra. + Adding broad support S3, MySQL or Hive should be a stroll in the park. + (Feel free to send us a patch when you're done!) - Date algebra included. - Lots of unit tests of the most basic stuff -It wouldn't be fair not to mention some limitations with the current -design: +It wouldn't be fair not to mention some limitations with the current design: -- Its focus is on batch processing so it's probably less useful for - near real-time pipelines or continuously running processes. -- The assumption is that a each task is a sizable chunk of work. While - you can probably schedule a few thousand jobs, it's not meant to - scale beyond tens of thousands. -- Luigi maintains a strict separation between scheduling tasks and - running them. Dynamic for-loops and branches are non-trivial to - implement. For instance, it's tricky to iterate a numerical - computation task until it converges. +- Its focus is on batch processing so + it's probably less useful for near real-time pipelines or continuously running processes. +- The assumption is that a each task is a sizable chunk of work. + While you can probably schedule a few thousand jobs, + it's not meant to scale beyond tens of thousands. 
+- Luigi maintains a strict separation between scheduling tasks and running them. + Dynamic for-loops and branches are non-trivial to implement. + For instance, it's tricky to iterate a numerical computation task until it converges. -It should actually be noted that all these limitations are not -fundamental in any way. However, it would take some major refactoring -work. - -Also it should be mentioned that Luigi is named after the world's second -most famous plumber. +It should actually be noted that all these limitations are not fundamental in any way. +However, it would take some major refactoring work. +Also it should be mentioned that Luigi is named after the world's second most famous plumber. diff --git a/doc/programmatic_execution.rst b/doc/programmatic_execution.rst index 5c72e06b12..86894e9720 100644 --- a/doc/programmatic_execution.rst +++ b/doc/programmatic_execution.rst @@ -11,9 +11,9 @@ As seen above, command line integration is achieved by simply adding This will read the args from the command line (using argparse) and invoke everything. -In case you just want to run a Luigi chain from a Python script, you can -do that internally without the command line integration. The code will -look something like +In case you just want to run a Luigi chain from a Python script, +you can do that internally without the command line integration. +The code will look something like .. code:: python diff --git a/examples/dynamic_requirements.py b/examples/dynamic_requirements.py index f043910a68..6b56b74c17 100644 --- a/examples/dynamic_requirements.py +++ b/examples/dynamic_requirements.py @@ -1,7 +1,8 @@ -import luigi import random as rnd import time +import luigi + class Config(luigi.Task): seed = luigi.IntParameter() diff --git a/examples/elasticsearch_index.py b/examples/elasticsearch_index.py index 5bbcb21be1..264f26a155 100644 --- a/examples/elasticsearch_index.py +++ b/examples/elasticsearch_index.py @@ -1,11 +1,14 @@ # coding: utf-8 -from luigi.contrib.esindex import CopyToIndex import datetime import json + import luigi +from luigi.contrib.esindex import CopyToIndex + class FakeDocuments(luigi.Task): + """ Generate some documents to index. """ date = luigi.DateParameter(default=datetime.date.today()) @@ -23,7 +26,9 @@ def run(self): def output(self): return luigi.LocalTarget(path='/tmp/_docs-%s.ldj' % self.date) + class IndexDocuments(CopyToIndex): + """ Run diff --git a/examples/foo.py b/examples/foo.py index 2ddd5d49a1..fb51670727 100644 --- a/examples/foo.py +++ b/examples/foo.py @@ -1,20 +1,23 @@ -import luigi -import time import os import shutil +import time + +import luigi class MyExternal(luigi.ExternalTask): + def complete(self): return False class Foo(luigi.Task): + def run(self): print "Running Foo" def requires(self): -# yield MyExternal() + # yield MyExternal() for i in xrange(10): yield Bar(i) diff --git a/examples/ftp_experiment_outputs.py b/examples/ftp_experiment_outputs.py index c485cd9041..b80199dc10 100644 --- a/examples/ftp_experiment_outputs.py +++ b/examples/ftp_experiment_outputs.py @@ -8,9 +8,11 @@ class ExperimentTask(luigi.ExternalTask): + ''' This class represents something that was created elsewhere by an external process, so all we want to do is to implement the output method. 
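The programmatic-execution text above says the code will "look something like" a call into Luigi; as a hedged, self-contained sketch (``MyTask`` is a hypothetical stand-in for the root of a real pipeline), one way to drive a chain without command line integration is to call ``luigi.build`` directly:

.. code:: python

    import luigi


    class MyTask(luigi.Task):
        # hypothetical task with no output(); complete() is overridden for the demo
        x = luigi.IntParameter(default=42)

        def complete(self):
            return False

        def run(self):
            print 'running MyTask with x=%d' % self.x


    if __name__ == '__main__':
        # local_scheduler=True avoids the need for a central scheduler daemon
        luigi.build([MyTask(x=7)], local_scheduler=True)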
''' + def output(self): return RemoteTarget('/experiment/output1.txt', HOST, username=USER, password=PWD) @@ -23,9 +25,11 @@ def run(self): class ProcessingTask(luigi.Task): + ''' This class represents something that was created elsewhere by an external process, so all we want to do is to implement the output method. ''' + def requires(self): return ExperimentTask() diff --git a/examples/spark_als.py b/examples/spark_als.py index 1e66ba0a00..ca4c942a7e 100644 --- a/examples/spark_als.py +++ b/examples/spark_als.py @@ -48,7 +48,6 @@ def output(self): '%s/als-output/*' % self.item_type, format=luigi.format.Gzip) - ''' // Corresponding example Spark Job, a wrapper around the MLLib ALS job. // This class would have to be jarred into my-spark-assembly.jar @@ -85,4 +84,4 @@ def main(args: Array[String]) { sc.stop() } } -''' \ No newline at end of file +''' diff --git a/examples/ssh_remote_execution.py b/examples/ssh_remote_execution.py index 5db3c5ca95..4dd5011a3f 100644 --- a/examples/ssh_remote_execution.py +++ b/examples/ssh_remote_execution.py @@ -1,4 +1,5 @@ from collections import defaultdict + import luigi from luigi.contrib.ssh import RemoteContext, RemoteTarget from luigi.mock import MockFile @@ -7,9 +8,11 @@ class CreateRemoteData(luigi.Task): + """ Dump info on running processes on remote host. Data is still stored on the remote host """ + def output(self): return RemoteTarget( "/tmp/stuff", @@ -24,11 +27,13 @@ def run(self): class ProcessRemoteData(luigi.Task): + """ Create a toplist of users based on how many running processes they have on a remote machine In this example the processed data is stored in a MockFile """ + def requires(self): return CreateRemoteData() diff --git a/examples/terasort.py b/examples/terasort.py index e274820bbd..12a7c5870a 100644 --- a/examples/terasort.py +++ b/examples/terasort.py @@ -25,11 +25,12 @@ def hadoop_examples_jar(): class TeraGen(luigi.hadoop_jar.HadoopJarJobTask): + """Runs TeraGen, by default with 1TB of data (10B records)""" records = luigi.Parameter(default="10000000000", - description="Number of records, each record is 100 Bytes") + description="Number of records, each record is 100 Bytes") terasort_in = luigi.Parameter(default=DEFAULT_TERASORT_IN, - description="directory to store terasort input into.") + description="directory to store terasort input into.") def output(self): return luigi.hdfs.HdfsTarget(self.terasort_in) @@ -46,12 +47,13 @@ def args(self): class TeraSort(luigi.hadoop_jar.HadoopJarJobTask): + """Runs TeraGent, by default using """ terasort_in = luigi.Parameter(default=DEFAULT_TERASORT_IN, - description="directory to store terasort input into.") + description="directory to store terasort input into.") terasort_out = luigi.Parameter(default=DEFAULT_TERASORT_OUT, - description="directory to store terasort output into.") + description="directory to store terasort output into.") def requires(self): return TeraGen(terasort_in=self.terasort_in) diff --git a/examples/top_artists.py b/examples/top_artists.py index a9c4565d26..22c39e365e 100755 --- a/examples/top_artists.py +++ b/examples/top_artists.py @@ -1,10 +1,15 @@ import random -import luigi, luigi.hdfs, luigi.hadoop -import luigi.postgres -from heapq import nlargest from collections import defaultdict +from heapq import nlargest + +import luigi +import luigi.hadoop +import luigi.hdfs +import luigi.postgres + class ExternalStreams(luigi.ExternalTask): + ''' Example of a possible external data dump To depend on external targets (typically at the top of your dependency 
graph), you can define @@ -16,14 +21,16 @@ def output(self): return luigi.hdfs.HdfsTarget(self.date.strftime( 'data/streams_%Y-%m-%d.tsv')) + class Streams(luigi.Task): + ''' Faked version right now, just generates bogus data. ''' date = luigi.DateParameter() def run(self): with self.output().open('w') as output: - for i in xrange(1000): + for _ in xrange(1000): output.write('{} {} {}\n'.format( random.randint(0, 999), random.randint(0, 999), @@ -33,10 +40,13 @@ def output(self): return luigi.LocalTarget(self.date.strftime( 'data/streams_%Y_%m_%d_faked.tsv')) + class StreamsHdfs(Streams): + def output(self): return luigi.hdfs.HdfsTarget(self.date.strftime('data/streams_%Y_%m_%d_faked.tsv')) + class AggregateArtists(luigi.Task): date_interval = luigi.DateIntervalParameter() @@ -50,16 +60,17 @@ def requires(self): def run(self): artist_count = defaultdict(int) - for input in self.input(): - with input.open('r') as in_file: + for t in self.input(): + with t.open('r') as in_file: for line in in_file: - timestamp, artist, track = line.strip().split() + _, artist, track = line.strip().split() artist_count[artist] += 1 with self.output().open('w') as out_file: for artist, count in artist_count.iteritems(): out_file.write('{}\t{}\n'.format(artist, count)) + class AggregateArtistsHadoop(luigi.hadoop.JobTask): date_interval = luigi.DateIntervalParameter() @@ -79,9 +90,10 @@ def mapper(self, line): def reducer(self, key, values): yield key, sum(values) + class Top10Artists(luigi.Task): date_interval = luigi.DateIntervalParameter() - use_hadoop = luigi.BooleanParameter() + use_hadoop = luigi.BoolParameter() def requires(self): if self.use_hadoop: @@ -110,9 +122,10 @@ def _input_iterator(self): artist, streams = line.strip().split() yield int(streams), artist + class ArtistToplistToDatabase(luigi.postgres.CopyToTable): date_interval = luigi.DateIntervalParameter() - use_hadoop = luigi.BooleanParameter() + use_hadoop = luigi.BoolParameter() host = "localhost" database = "toplists" diff --git a/examples/wordcount.py b/examples/wordcount.py index 92472d6aa8..a0078203f3 100644 --- a/examples/wordcount.py +++ b/examples/wordcount.py @@ -2,13 +2,16 @@ class InputText(luigi.ExternalTask): + ''' This class represents something that was created elsewhere by an external process, so all we want to do is to implement the output method. 
''' date = luigi.DateParameter() + def output(self): return luigi.LocalTarget(self.date.strftime('/var/tmp/text/%Y-%m-%d.txt')) + class WordCount(luigi.Task): date_interval = luigi.DateIntervalParameter() @@ -20,8 +23,8 @@ def output(self): def run(self): count = {} - for file in self.input(): # The input() method is a wrapper around requires() that returns Target objects - for line in file.open('r'): # Target objects are a file system/format abstraction and this will return a file stream object + for f in self.input(): # The input() method is a wrapper around requires() that returns Target objects + for line in f.open('r'): # Target objects are a file system/format abstraction and this will return a file stream object for word in line.strip().split(): count[word] = count.get(word, 0) + 1 @@ -29,7 +32,7 @@ def run(self): f = self.output().open('w') for word, count in count.iteritems(): f.write("%s\t%d\n" % (word, count)) - f.close() # Note that this is essential because file system operations are atomic + f.close() # Note that this is essential because file system operations are atomic if __name__ == '__main__': luigi.run(main_task_cls=WordCount) diff --git a/examples/wordcount_hadoop.py b/examples/wordcount_hadoop.py index b11e5a093c..7a89b3e7f4 100644 --- a/examples/wordcount_hadoop.py +++ b/examples/wordcount_hadoop.py @@ -1,15 +1,21 @@ -import luigi, luigi.hadoop, luigi.hdfs +import luigi +import luigi.hadoop +import luigi.hdfs + # To make this run, you probably want to edit /etc/luigi/client.cfg and add something like: # # [hadoop] # jar: /usr/lib/hadoop-xyz/hadoop-streaming-xyz-123.jar + class InputText(luigi.ExternalTask): date = luigi.DateParameter() + def output(self): return luigi.hdfs.HdfsTarget(self.date.strftime('/tmp/text/%Y-%m-%d.txt')) + class WordCount(luigi.hadoop.JobTask): date_interval = luigi.DateIntervalParameter() diff --git a/luigi/__init__.py b/luigi/__init__.py index ced49b85e9..efa5442ddb 100644 --- a/luigi/__init__.py +++ b/luigi/__init__.py @@ -1,4 +1,4 @@ -"""Package containing core luigi functionality""" +""" Package containing core luigi functionality.""" # Copyright (c) 2012 Spotify AB # # Licensed under the Apache License, Version 2.0 (the "License"); you may not @@ -25,6 +25,8 @@ Event = event.Event Task = task.Task +Config = task.Config +ConfigWithoutSection = task.ConfigWithoutSection ExternalTask = task.ExternalTask WrapperTask = task.WrapperTask Target = target.Target @@ -40,10 +42,12 @@ # TODO: how can we get rid of these? DateHourParameter = parameter.DateHourParameter +DateMinuteParameter = parameter.DateMinuteParameter DateParameter = parameter.DateParameter IntParameter = parameter.IntParameter FloatParameter = parameter.FloatParameter -BooleanParameter = parameter.BooleanParameter +BooleanParameter = parameter.BooleanParameter # backward compatibility +BoolParameter = parameter.BoolParameter DateIntervalParameter = parameter.DateIntervalParameter TimeDeltaParameter = parameter.TimeDeltaParameter diff --git a/luigi/configuration.py b/luigi/configuration.py index c3859230b4..0b4171c063 100644 --- a/luigi/configuration.py +++ b/luigi/configuration.py @@ -1,6 +1,19 @@ +# Copyright (c) 2015 Spotify AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. 
You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. -import os import logging +import os from ConfigParser import ConfigParser, NoOptionError, NoSectionError @@ -31,9 +44,12 @@ def reload(cls): return cls.instance().read(cls._config_paths) def _get_with_default(self, method, section, option, default, expected_type=None): - """ Gets the value of the section/option using method. Returns default if value - is not found. Raises an exception if the default value is not None and doesn't match - the expected_type. + """ + Gets the value of the section/option using method. + + Returns default if value is not found. + + Raises an exception if the default value is not None and doesn't match the expected_type. """ try: return method(self, section, option) @@ -63,12 +79,15 @@ def getintdict(self, section): except NoSectionError: return {} - def set(self, section, option, value): + def set(self, section, option, value=None): if not ConfigParser.has_section(self, section): ConfigParser.add_section(self, section) return ConfigParser.set(self, section, option, value) + def get_config(): - """ Convenience method (for backwards compatibility) for accessing config singleton """ + """ + Convenience method (for backwards compatibility) for accessing config singleton. + """ return LuigiConfigParser.instance() diff --git a/luigi/contrib/__init__.py b/luigi/contrib/__init__.py index 2bbd518b54..44e46acef7 100644 --- a/luigi/contrib/__init__.py +++ b/luigi/contrib/__init__.py @@ -1 +1,3 @@ -"""Package containing optional and-on functionality.""" \ No newline at end of file +""" +Package containing optional and-on functionality. +""" diff --git a/luigi/contrib/esindex.py b/luigi/contrib/esindex.py index 4585e50fcb..c5bd164c99 100644 --- a/luigi/contrib/esindex.py +++ b/luigi/contrib/esindex.py @@ -6,10 +6,11 @@ Provides an `ElasticsearchTarget` and a `CopyToIndex` template task. Modeled after `luigi.contrib.rdbms.CopyToTable`. ----- A minimal example (assuming elasticsearch is running on localhost:9200): +.. code-block:: python + class ExampleIndex(CopyToIndex): index = 'example' @@ -20,10 +21,10 @@ def docs(self): task = ExampleIndex() luigi.build([task], local_scheduler=True) ----- - All options: +.. code-block:: python + class ExampleIndex(CopyToIndex): host = 'localhost' port = 9200 @@ -39,22 +40,25 @@ def docs(self): task = ExampleIndex() luigi.build([task], local_scheduler=True) ----- - `Host`, `port`, `index`, `doc_type` parameters are standard elasticsearch. `purge_existing_index` will delete the index, whenever an update is required. -This is useful, when one deals with "dumps" that represent the whole data, -not just updates. +This is useful, when one deals with "dumps" that represent the whole data, not just updates. `marker_index_hist_size` sets the maximum number of entries in the 'marker' -index. Keep all updates by default (0). Use 1 to only remember the most recent -update to the index. This can be useful, if an index needs to recreated, even -though the corresponding indexing task has been run sometime in the past - but +index: + +* 0 (default) keeps all updates, +* 1 to only remember the most recent update to the index. 
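To illustrate the configuration helpers touched above, here is a small sketch of reading and writing values through the singleton parser; the section and option names other than ``pig``/``home`` are made up for the example.

.. code-block:: python

    from luigi import configuration

    config = configuration.get_config()

    # read a value, falling back to the supplied default if the
    # section or option is missing
    pig_home = config.get('pig', 'home', '/usr/share/pig')

    # set() now takes an optional value and creates the section when needed
    config.set('my-section', 'enabled', 'true')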
+ +This can be useful, if an index needs to recreated, even though +the corresponding indexing task has been run sometime in the past - but a later indexing task might have altered the index in the meantime. There are a two luigi `client.cfg` configuration options: +.. code-block:: ini + [elasticsearch] marker-index = update_log @@ -68,6 +72,7 @@ def docs(self): import hashlib import json import logging + import luigi logger = logging.getLogger('luigi-interface') @@ -85,23 +90,28 @@ def docs(self): class ElasticsearchTarget(luigi.Target): - """ Target for a resource in Elasticsearch. """ + """ Target for a resource in Elasticsearch.""" marker_index = luigi.configuration.get_config().get('elasticsearch', - 'marker-index', 'update_log') + 'marker-index', 'update_log') marker_doc_type = luigi.configuration.get_config().get('elasticsearch', - 'marker-doc-type', 'entry') + 'marker-doc-type', 'entry') def __init__(self, host, port, index, doc_type, update_id, marker_index_hist_size=0, http_auth=None): """ - Args: - host (str): Elasticsearch server host - port (int): Elasticsearch server port - index (str): Index name - doc_type (str): Doctype name - update_id (str): An identifier for this data set - marker_index_hist_size (int): List of changes to the index to remember + :param host: Elasticsearch server host + :type host: str + :param port: Elasticsearch server port + :type port: int + :param index: index name + :type index: str + :param doc_type: doctype name + :type doc_type: str + :param update_id: an identifier for this data set + :type update_id: str + :param marker_index_hist_size: list of changes to the index to remember + :type marker_index_hist_size: int """ self.host = host self.port = port @@ -119,28 +129,34 @@ def __init__(self, host, port, index, doc_type, update_id, ) def marker_index_document_id(self): - """ Generate an id for the indicator document. """ + """ + Generate an id for the indicator document. + """ params = '%s:%s:%s' % (self.index, self.doc_type, self.update_id) return hashlib.sha1(params).hexdigest() def touch(self): - """ Mark this update as complete. The document id would be sufficent, - but we index the parameters - - (update_id, target_index, target_doc_type, date) + """ + Mark this update as complete. - as well for documentation. """ + The document id would be sufficent but, + for documentation, + we index the parameters `update_id`, `target_index`, `target_doc_type` and `date` as well. + """ self.create_marker_index() self.es.index(index=self.marker_index, doc_type=self.marker_doc_type, id=self.marker_index_document_id(), body={ - 'update_id': self.update_id, 'target_index': self.index, - 'target_doc_type': self.doc_type, - 'date': datetime.datetime.now()}) + 'update_id': self.update_id, + 'target_index': self.index, + 'target_doc_type': self.doc_type, + 'date': datetime.datetime.now()}) self.es.indices.flush(index=self.marker_index) self.ensure_hist_size() def exists(self): - """ Test, if this task has been run. """ + """ + Test, if this task has been run. + """ try: _ = self.es.get(index=self.marker_index, doc_type=self.marker_doc_type, @@ -153,13 +169,17 @@ def exists(self): return False def create_marker_index(self): - """ Create the index that will keep track of the tasks if necessary. """ + """ + Create the index that will keep track of the tasks if necessary. 
+ """ if not self.es.indices.exists(index=self.marker_index): self.es.indices.create(index=self.marker_index) def ensure_hist_size(self): - """ Shrink the history of updates for a `index/doc_type` combination - down to `self.marker_index_hist_size`. """ + """ + Shrink the history of updates for + a `index/doc_type` combination down to `self.marker_index_hist_size`. + """ if self.marker_index_hist_size == 0: return result = self.es.search(index=self.marker_index, @@ -184,10 +204,10 @@ class CopyToIndex(luigi.Task): 1. Subclass and override the required `index` attribute. - 2. Implement a custom `docs` method, that returns an iterable over - the documents. A document can be a JSON string, e.g. from - a newline-delimited JSON (ldj) file (default implementation) or some - dictionary. + 2. Implement a custom `docs` method, that returns an iterable over the documents. + A document can be a JSON string, + e.g. from a newline-delimited JSON (ldj) file (default implementation) + or some dictionary. Optional attributes: @@ -206,71 +226,97 @@ class CopyToIndex(luigi.Task): @property def host(self): - """ ES hostname """ + """ + ES hostname. + """ return 'localhost' @property def port(self): - """ ES port """ + """ + ES port. + """ return 9200 @property def http_auth(self): """ - ES optional http auth information - as either ‘:’ separated string or a tuple. - eg: ` ('user', 'pass') ` or ` "user:pass" ` + ES optional http auth information as either ‘:’ separated string or a tuple, + e.g. `('user', 'pass')` or `"user:pass"`. """ return None @abc.abstractproperty def index(self): - """ The target index. May exists or not. """ + """ + The target index. + + May exist or not. + """ return None @property def doc_type(self): - """ The target doc_type. """ + """ + The target doc_type. + """ return 'default' @property def mapping(self): - """ Dictionary with custom mapping or `None`. """ + """ + Dictionary with custom mapping or `None`. + """ return None @property def settings(self): - """ Settings to be used at index creation time. """ + """ + Settings to be used at index creation time. + """ return {'settings': {}} @property def chunk_size(self): - """ Single API call for this number of docs. """ + """ + Single API call for this number of docs. + """ return 2000 @property def raise_on_error(self): - """ Whether to fail fast. """ + """ + Whether to fail fast. + """ return True @property def purge_existing_index(self): - """ Whether to delete the `index` completely before any indexing. """ + """ + Whether to delete the `index` completely before any indexing. + """ return False @property def marker_index_hist_size(self): - """ Number of event log entries in the marker index. 0: unlimited. """ + """ + Number of event log entries in the marker index. 0: unlimited. + """ return 0 @property def timeout(self): - """ Timeout. """ + """ + Timeout. + """ return 10 def docs(self): - """ Return the documents to be indexed. Beside the user defined - fields, the document may contain an `_index`, `_type` and `_id`. """ + """ + Return the documents to be indexed. + + Beside the user defined fields, the document may contain an `_index`, `_type` and `_id`. + """ with self.input().open('r') as fobj: for line in fobj: yield line @@ -278,8 +324,9 @@ def docs(self): # everything below will rarely have to be overridden def _docs(self): - """ Since `self.docs` may yield documents that do not explicitly - contain `_index` or `_type`, add those attributes here, if necessary. 
+ """ + Since `self.docs` may yield documents that do not explicitly contain `_index` or `_type`, + add those attributes here, if necessary. """ first = iter(self.docs()).next() needs_parsing = False @@ -308,7 +355,8 @@ def _init_connection(self): ) def create_index(self): - """ Override to provide code for creating the target index. + """ + Override to provide code for creating the target index. By default it will be created without any special settings or mappings. """ @@ -317,17 +365,22 @@ def create_index(self): es.indices.create(index=self.index, body=self.settings) def delete_index(self): - """ Delete the index, if it exists. """ + """ + Delete the index, if it exists. + """ es = self._init_connection() if es.indices.exists(index=self.index): es.indices.delete(index=self.index) def update_id(self): - """ This id will be a unique identifier for this indexing task.""" + """ + This id will be a unique identifier for this indexing task. + """ return self.task_id def output(self): - """ Returns a ElasticsearchTarget representing the inserted dataset. + """ + Returns a ElasticsearchTarget representing the inserted dataset. Normally you don't override this. """ @@ -339,15 +392,20 @@ def output(self): doc_type=self.doc_type, update_id=self.update_id(), marker_index_hist_size=self.marker_index_hist_size - ) + ) def run(self): - """ Purge existing index, if requested (`purge_existing_index`). - Create the index, if missing. Apply mappings, if given. - Set refresh interval to -1 (disable) for performance reasons. - Bulk index in batches of size `chunk_size` (2000). - Set refresh interval to 1s. Refresh Elasticsearch. - Create entry in marker index. + """ + Run task, namely: + + * purge existing index, if requested (`purge_existing_index`), + * create the index, if missing, + * apply mappings, if given, + * set refresh interval to -1 (disable) for performance reasons, + * bulk index in batches of size `chunk_size` (2000), + * set refresh interval to 1s, + * refresh Elasticsearch, + * create entry in marker index. """ if self.purge_existing_index: self.delete_index() diff --git a/luigi/contrib/ftp.py b/luigi/contrib/ftp.py index ff9f68da2e..53bc6c9e62 100644 --- a/luigi/contrib/ftp.py +++ b/luigi/contrib/ftp.py @@ -1,5 +1,6 @@ """ -This library is a wrapper of ftplib. It is convenient to move data from/to FTP. +This library is a wrapper of ftplib. +It is convenient to move data from/to FTP. There is an example on how to use it (example/ftp_experiment_outputs.py) @@ -7,37 +8,62 @@ Be aware that normal ftp do not provide secure communication. """ +import datetime +import ftplib import os import random -import ftplib + import luigi -import luigi.target import luigi.format +import luigi.target from luigi.format import FileWrapper class RemoteFileSystem(luigi.target.FileSystem): - def __init__(self, host, username=None, password=None): + + def __init__(self, host, username=None, password=None, port=21, tls=False): self.host = host self.username = username self.password = password + self.port = port + self.tls = tls def _connect(self): - """ Log in to ftp """ - self.ftpcon = ftplib.FTP(self.host, self.username, self.password) + """ + Log in to ftp. + """ + if self.tls: + self.ftpcon = ftplib.FTP_TLS() + else: + self.ftpcon = ftplib.FTP() + self.ftpcon.connect(self.host, self.port) + self.ftpcon.login(self.username, self.password) + if self.tls: + self.ftpcon.prot_p() + + def exists(self, path, mtime=None): + """ + Return `True` if file or directory at `path` exist, False otherwise. 
+ + Additional check on modified time when mtime is passed in. - def exists(self, path): - """ Return `True` if file or directory at `path` exist, False otherwise """ + Return False if the file's modified time is older mtime. + """ self._connect() files = self.ftpcon.nlst(path) - # empty list, means do not exists - if not files: - return False + result = False + if files: + if mtime: + mdtm = self.ftpcon.sendcmd('MDTM ' + path) + modified = datetime.datetime.strptime(mdtm[4:], "%Y%m%d%H%M%S") + result = modified > mtime + else: + result = True self.ftpcon.quit() - return True + return result def _rm_recursive(self, ftp, path): """ @@ -70,11 +96,14 @@ def _rm_recursive(self, ftp, path): print('_rm_recursive: Could not remove {0}: {1}'.format(path, e)) def remove(self, path, recursive=True): - """ Remove file or directory at location ``path`` + """ + Remove file or directory at location ``path``. - :param str path: a path within the FileSystem to remove. - :param bool recursive: if the path is a directory, recursively remove the directory and all - of its descendants. Defaults to ``True``. + :param path: a path within the FileSystem to remove. + :type path: str + :param recursive: if the path is a directory, recursively remove the directory and + all of its descendants. Defaults to ``True``. + :type recursive: bool """ self._connect() @@ -82,7 +111,7 @@ def remove(self, path, recursive=True): self._rm_recursive(self.ftpcon, path) else: try: - #try delete file + # try delete file self.ftpcon.delete(path) except ftplib.all_errors: # it is a folder, delete it @@ -125,18 +154,28 @@ def get(self, path, local_path): tmp_local_path = local_path + '-luigi-tmp-%09d' % random.randrange(0, 1e10) # download file self._connect() - self.ftpcon.retrbinary('RETR %s' % path, open(tmp_local_path, 'wb').write) + self.ftpcon.retrbinary('RETR %s' % path, open(tmp_local_path, 'wb').write) self.ftpcon.quit() os.rename(tmp_local_path, local_path) class AtomicFtpfile(file): - """ Simple class that writes to a temp file and upload to ftp on close(). - Also cleans up the temp file if close is not invoked. """ + Simple class that writes to a temp file and upload to ftp on close(). + + Also cleans up the temp file if close is not invoked. + """ + def __init__(self, fs, path): - self.__tmp_path = self.path + '-luigi-tmp-%09d' % random.randrange(0, 1e10) + """ + Initializes an AtomicFtpfile instance. + + :param fs: + :param path: + :type path: str + """ + self.__tmp_path = '%s-luigi-tmp-%09d' % (path, random.randrange(0, 1e10)) self._fs = fs self.path = path super(AtomicFtpfile, self).__init__(self.__tmp_path, 'w') @@ -162,6 +201,7 @@ def fs(self): def __exit__(self, exc_type, exc, traceback): """ Close/commit the file if there are no exception + Upload file to ftp """ if exc_type: @@ -171,27 +211,33 @@ def __exit__(self, exc_type, exc, traceback): class RemoteTarget(luigi.target.FileSystemTarget): """ - Target used for reading from remote files. The target is implemented using - ssh commands streaming data over the network. + Target used for reading from remote files. + + The target is implemented using ssh commands streaming data over the network. 
""" - def __init__(self, path, host, format=None, username=None, password=None): + + def __init__(self, path, host, format=None, username=None, password=None, port=21, mtime=None, tls=False): self.path = path + self.mtime = mtime self.format = format - self._fs = RemoteFileSystem(host, username, password) + self.tls = tls + self._fs = RemoteFileSystem(host, username, password, port, tls) @property def fs(self): return self._fs def open(self, mode): - """Open the FileSystem target. + """ + Open the FileSystem target. This method returns a file-like object which can either be read from or written to depending on the specified mode. - :param str mode: the mode `r` opens the FileSystemTarget in read-only mode, whereas `w` will - open the FileSystemTarget in write mode. Subclasses can implement - additional options. + :param mode: the mode `r` opens the FileSystemTarget in read-only mode, whereas `w` will + open the FileSystemTarget in write mode. Subclasses can implement + additional options. + :type mode: str """ if mode == 'w': if self.format: @@ -212,6 +258,9 @@ def open(self, mode): else: raise Exception('mode must be r/w') + def exists(self): + return self.fs.exists(self.path, self.mtime) + def put(self, local_path): self.fs.put(local_path, self.path) diff --git a/luigi/contrib/mysqldb.py b/luigi/contrib/mysqldb.py index 341e73286d..83573871fa 100644 --- a/luigi/contrib/mysqldb.py +++ b/luigi/contrib/mysqldb.py @@ -13,19 +13,26 @@ class MySqlTarget(luigi.Target): - """Target for a resource in MySql""" + """ + Target for a resource in MySql. + """ marker_table = luigi.configuration.get_config().get('mysql', 'marker-table', 'table_updates') def __init__(self, host, database, user, password, table, update_id): """ - Args: - host (str): MySql server address. Possibly a host:port string. - database (str): Database name - user (str): Database user - password (str): Password for specified user - update_id (str): An identifier for this data set - + Initializes a MySqlTarget instance. + + :param host: MySql server address. Possibly a host:port string. + :type host: str + :param database: database name. + :type database: str + :param user: database user + :type user: str + :param password: password for specified user. + :type password: str + :param update_id: an identifier for this data set. + :type update_id: str """ if ':' in host: self.host, self.port = host.split(':') @@ -40,10 +47,12 @@ def __init__(self, host, database, user, password, table, update_id): self.update_id = update_id def touch(self, connection=None): - """Mark this update as complete. + """ + Mark this update as complete. - Important: If the marker table doesn't exist, the connection transaction will be aborted - and the connection reset. Then the marker table will be created. + IMPORTANT, If the marker table doesn't exist, + the connection transaction will be aborted and the connection reset. + Then the marker table will be created. """ self.create_marker_table() @@ -71,11 +80,11 @@ def exists(self, connection=None): cursor.execute("""SELECT 1 FROM {marker_table} WHERE update_id = %s LIMIT 1""".format(marker_table=self.marker_table), - (self.update_id,) - ) + (self.update_id,) + ) row = cursor.fetchone() except mysql.connector.Error as e: - if e.errno == errorcode.ER_NO_SUCH_TABLE: + if e.errno == errorcode.ER_NO_SUCH_TABLE: row = None else: raise @@ -91,9 +100,11 @@ def connect(self, autocommit=False): return connection def create_marker_table(self): - """Create marker table if it doesn't exist. 
+ """ + Create marker table if it doesn't exist. - Using a separate connection since the transaction might have to be reset""" + Using a separate connection since the transaction might have to be reset. + """ connection = self.connect(autocommit=True) cursor = connection.cursor() try: diff --git a/luigi/contrib/pig.py b/luigi/contrib/pig.py new file mode 100644 index 0000000000..80a561a2d2 --- /dev/null +++ b/luigi/contrib/pig.py @@ -0,0 +1,185 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +import datetime +import logging +import os +import select +import signal +import subprocess +import tempfile + +import luigi +from luigi import configuration + +logger = logging.getLogger('luigi-interface') + +""" +Apache Pig support. + +Example configuration section in client.cfg: +[pig] +# pig home directory +home: /usr/share/pig +""" + + +class PigJobTask(luigi.Task): + + def pig_home(self): + return configuration.get_config().get('pig', 'home', '/usr/share/pig') + + def pig_command_path(self): + return os.path.join(self.pig_home(), "bin/pig") + + def pig_env_vars(self): + """ + Dictionary of environment variables that should be set when running Pig. + + Ex: + return { 'PIG_CLASSPATH': '/your/path' } + """ + return {} + + def pig_properties(self): + """ + Dictionary of properties that should be set when running Pig. + + Ex: + return { 'pig.additional.jars':'/path/to/your/jar' } + """ + return {} + + def pig_parameters(self): + """ + Dictionary of parameters that should be set for the Pig job. + Ex: + return { 'YOUR_PARAM_NAME':'Your param value' } + """ + return {} + + def pig_options(self): + """ + List of options that will be appended to the Pig command. + Ex: + return ['-x', 'local'] + """ + return [] + + def output(self): + raise NotImplementedError("subclass should define output path") + + def pig_script_path(self): + """ + Return the path to the Pig script to be run. 
+ """ + raise NotImplementedError("subclass should define pig_script_path") + + def _build_pig_cmd(self): + opts = self.pig_options() + + for k, v in self.pig_parameters().iteritems(): + opts.append("-p") + opts.append("%s=%s" % (k, v)) + + if self.pig_properties(): + with open('pig_property_file', 'w') as prop_file: + prop_file.writelines(["%s=%s%s" % (k, v, os.linesep) for (k, v) in self.pig_properties().iteritems()]) + opts.append('-propertyFile') + opts.append('pig_property_file') + + cmd = [self.pig_command_path()] + opts + ["-f", self.pig_script_path()] + + logger.info(' '.join(cmd)) + return cmd + + def run(self): + cmd = self._build_pig_cmd() + self.track_and_progress(cmd) + + def track_and_progress(self, cmd): + temp_stdout = tempfile.TemporaryFile() + env = os.environ.copy() + env['PIG_HOME'] = self.pig_home() + for k, v in self.pig_env_vars().iteritems(): + env[k] = v + + proc = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) + reads = [proc.stderr.fileno(), proc.stdout.fileno()] + # tracking the possible problems with this job + err_lines = [] + with PigRunContext() as pig_context: + while proc.poll() is None: + ret = select.select(reads, [], []) + for fd in ret[0]: + if fd == proc.stderr.fileno(): + line = proc.stderr.readline() + err_lines.append(line) + if fd == proc.stdout.fileno(): + line = proc.stdout.readline() + temp_stdout.write(line) + + err_line = line.lower() + if err_line.find('More information at:') != -1: + logger.info(err_line.split('more information at: ')[-1].strip()) + if err_line.find(' - '): + t = err_line.split(' - ')[-1].strip() + if t != "": + logger.info(t) + + # Read the rest + stdout + err = ''.join(err_lines + [err_line for err_line in proc.stderr]) + if proc.returncode == 0: + logger.info("Job completed successfully!") + else: + logger.error("Error when running script:\n%s", self.pig_script_path()) + logger.error(err) + raise PigJobError("Pig script failed with return value: %s" % (proc.returncode,), err=err) + + +class PigRunContext(object): + def __init__(self): + self.job_id = None + + def __enter__(self): + self.__old_signal = signal.getsignal(signal.SIGTERM) + signal.signal(signal.SIGTERM, self.kill_job) + return self + + def kill_job(self, captured_signal=None, stack_frame=None): + if self.job_id: + logger.info('Job interrupted, killing job %s', self.job_id) + subprocess.call(['pig', '-e', '"kill %s"' % self.job_id]) + if captured_signal is not None: + # adding 128 gives the exit code corresponding to a signal + sys.exit(128 + captured_signal) + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is KeyboardInterrupt: + self.kill_job() + signal.signal(signal.SIGTERM, self.__old_signal) + + +class PigJobError(RuntimeError): + def __init__(self, message, out=None, err=None): + super(PigJobError, self).__init__(message, out, err) + self.message = message + self.out = out + self.err = err + + def __str__(self): + info = self.message + if self.out: + info += "\nSTDOUT: " + str(self.out) + if self.err: + info += "\nSTDERR: " + str(self.err) + return info diff --git a/luigi/contrib/rdbms.py b/luigi/contrib/rdbms.py index 0d5dc71994..10023c59bd 100644 --- a/luigi/contrib/rdbms.py +++ b/luigi/contrib/rdbms.py @@ -12,11 +12,18 @@ class CopyToTable(luigi.Task): """ - An abstract task for inserting a data set into RDBMS + An abstract task for inserting a data set into RDBMS. Usage: - Subclass and override the required `host`, `database`, `user`, - `password`, `table` and `columns` attributes. 
+ + Subclass and override the following attributes: + + * `host`, + * `database`, + * `user`, + * `password`, + * `table` + * `columns` """ @abc.abstractproperty @@ -51,9 +58,9 @@ def table(self): column_separator = "\t" # how columns are separated in the file copied into postgres - def create_table(self, connection): - """ Override to provide code for creating the target table. + """ + Override to provide code for creating the target table. By default it will be created using types (optionally) specified in columns. @@ -71,9 +78,10 @@ def create_table(self, connection): query = "CREATE TABLE {table} ({coldefs})".format(table=self.table, coldefs=coldefs) connection.cursor().execute(query) - def update_id(self): - """This update id will be a unique identifier for this insert on this table.""" + """ + This update id will be a unique identifier for this insert on this table. + """ return self.task_id @abc.abstractmethod @@ -81,9 +89,12 @@ def output(self): raise NotImplementedError("This method must be overridden") def init_copy(self, connection): - """ Override to perform custom queries. + """ + Override to perform custom queries. - Any code here will be formed in the same transaction as the main copy, just prior to copying data. Example use cases include truncating the table or removing all data older than X in the database to keep a rolling window of data available in the table. + Any code here will be formed in the same transaction as the main copy, just prior to copying data. + Example use cases include truncating the table or removing all data older than X in the database + to keep a rolling window of data available in the table. """ # TODO: remove this after sufficient time so most people using the diff --git a/luigi/contrib/redshift.py b/luigi/contrib/redshift.py index e560417776..69aedf5bc8 100644 --- a/luigi/contrib/redshift.py +++ b/luigi/contrib/redshift.py @@ -1,14 +1,13 @@ import abc +import json import logging -import luigi.postgres +import time + import luigi -import json -from luigi.contrib import rdbms from luigi import postgres - +from luigi.contrib import rdbms from luigi.s3 import S3PathTask, S3Target - logger = logging.getLogger('luigi-interface') @@ -16,16 +15,21 @@ import psycopg2 import psycopg2.errorcodes except ImportError: - logger.warning("Loading postgres module without psycopg2 installed. Will crash at runtime if postgres functionality is used.") + logger.warning("Loading postgres module without psycopg2 installed. " + "Will crash at runtime if postgres functionality is used.") class RedshiftTarget(postgres.PostgresTarget): """ Target for a resource in Redshift. - Redshift is similar to postgres with a few adjustments required by redshift + Redshift is similar to postgres with a few adjustments + required by redshift. """ - marker_table = luigi.configuration.get_config().get('redshift', 'marker-table', 'table_updates') + marker_table = luigi.configuration.get_config().get( + 'redshift', + 'marker-table', + 'table_updates') use_db_timestamps = False @@ -35,70 +39,128 @@ class S3CopyToTable(rdbms.CopyToTable): Template task for inserting a data set into Redshift from s3. Usage: - Subclass and override the required attributes: - `host`, `database`, `user`, `password`, `table`, `columns`, - `aws_access_key_id`, `aws_secret_access_key`, `s3_load_path` + + * Subclass and override the required attributes: + * `host`, + * `database`, + * `user`, + * `password`, + * `table`, + * `columns`, + * `aws_access_key_id`, + * `aws_secret_access_key`, + * `s3_load_path`. 
""" @abc.abstractproperty def s3_load_path(self): - 'override to return the load path' + """ + Override to return the load path. + """ return None @abc.abstractproperty def aws_access_key_id(self): - 'override to return the key id' + """ + Override to return the key id. + """ return None @abc.abstractproperty def aws_secret_access_key(self): - 'override to return the secret access key' + """ + Override to return the secret access key. + """ return None @abc.abstractproperty def copy_options(self): - '''Add extra copy options, for example: + """ + Add extra copy options, for example: + + * TIMEFORMAT 'auto' + * IGNOREHEADER 1 + * TRUNCATECOLUMNS + * IGNOREBLANKLINES + """ + return '' - TIMEFORMAT 'auto' - IGNOREHEADER 1 - TRUNCATECOLUMNS - IGNOREBLANKLINES + def table_attributes(self): + '''Add extra table attributes, for example: + DISTSTYLE KEY + DISTKEY (MY_FIELD) + SORTKEY (MY_FIELD_2, MY_FIELD_3) ''' return '' + def do_truncate_table(self): + """ + Return True if table should be truncated before copying new data in. + """ + return False + + def truncate_table(self, connection): + query = "truncate %s" % self.table + cursor = connection.cursor() + try: + cursor.execute(query) + finally: + cursor.close() + + def create_table(self, connection): + """ + Override to provide code for creating the target table. + + By default it will be created using types (optionally) + specified in columns. + + If overridden, use the provided connection object for + setting up the table in order to create the table and + insert data using the same transaction. + """ + if len(self.columns[0]) == 1: + # only names of columns specified, no types + raise NotImplementedError("create_table() not implemented " + "for %r and columns types not " + "specified" % self.table) + elif len(self.columns[0]) == 2: + # if columns is specified as (name, type) tuples + coldefs = ','.join( + '{name} {type}'.format( + name=name, + type=type) for name, type in self.columns + ) + query = ("CREATE TABLE " + "{table} ({coldefs}) " + "{table_attributes}").format( + table=self.table, + coldefs=coldefs, + table_attributes=self.table_attributes()) + connection.cursor().execute(query) + def run(self): """ - If the target table doesn't exist, self.create_table will be called - to attempt to create the table. + If the target table doesn't exist, self.create_table + will be called to attempt to create the table. 
""" if not (self.table): raise Exception("table need to be specified") + path = self.s3_load_path() connection = self.output().connect() + if not self.does_table_exist(connection): + # try creating table + logger.info("Creating table %s", self.table) + connection.reset() + self.create_table(connection) + elif self.do_truncate_table(): + logger.info("Truncating table %s", self.table) + self.truncate_table(connection) - path = self.s3_load_path() logger.info("Inserting file: %s", path) - - # attempt to copy the data into postgres - # if it fails because the target table doesn't exist - # try to create it by running self.create_table - for attempt in xrange(2): - try: - cursor = connection.cursor() - self.init_copy(connection) - self.copy(cursor, path) - except psycopg2.ProgrammingError, e: - if e.pgcode == psycopg2.errorcodes.UNDEFINED_TABLE and attempt == 0: - # if first attempt fails with "relation not found", - # try creating table - logger.info("Creating table %s", self.table) - connection.reset() - self.create_table(connection) - else: - raise - else: - break - + cursor = connection.cursor() + self.init_copy(connection) + self.copy(cursor, path) self.output().touch(connection) connection.commit() @@ -106,9 +168,9 @@ def run(self): connection.close() def copy(self, cursor, f): - ''' - Defines copying from s3 into redshift - ''' + """ + Defines copying from s3 into redshift. + """ cursor.execute(""" COPY %s from '%s' @@ -120,17 +182,33 @@ def copy(self, cursor, f): self.copy_options)) def output(self): - """Returns a RedshiftTarget representing the inserted dataset. + """ + Returns a RedshiftTarget representing the inserted dataset. Normally you don't override this. """ return RedshiftTarget( - host=self.host, - database=self.database, - user=self.user, - password=self.password, - table=self.table, - update_id=self.update_id()) + host=self.host, + database=self.database, + user=self.user, + password=self.password, + table=self.table, + update_id=self.update_id()) + + def does_table_exist(self, connection): + """ + Determine whether the table already exists. + """ + query = ("select 1 as table_exists " + "from pg_table_def " + "where tablename = %s limit 1") + cursor = connection.cursor() + try: + cursor.execute(query, (self.table,)) + result = cursor.fetchone() + return bool(result) + finally: + cursor.close() class S3CopyJSONToTable(S3CopyToTable): @@ -138,29 +216,43 @@ class S3CopyJSONToTable(S3CopyToTable): Template task for inserting a JSON data set into Redshift from s3. Usage: - Subclass and override the required attributes: - `host`, `database`, `user`, `password`, `table`, `columns`, - `aws_access_key_id`, `aws_secret_access_key`, `s3_load_path`, - `jsonpath`, `copy_json_options` + + * Subclass and override the required attributes: + + * `host`, + * `database`, + * `user`, + * `password`, + * `table`, + * `columns`, + * `aws_access_key_id`, + * `aws_secret_access_key`, + * `s3_load_path`, + * `jsonpath`, + * `copy_json_options`. """ @abc.abstractproperty def jsonpath(self): - 'override the jsonpath schema location for the table' + """ + Override the jsonpath schema location for the table. + """ return '' @abc.abstractproperty def copy_json_options(self): - '''Add extra copy options, for example: - GZIP - LZOP - ''' + """ + Add extra copy options, for example: + + * GZIP + * LZOP + """ return '' def copy(self, cursor, f): - ''' - Defines copying JSON from s3 into redshift - ''' + """ + Defines copying JSON from s3 into redshift. 
+ """ cursor.execute(""" COPY %s from '%s' @@ -176,22 +268,26 @@ class RedshiftManifestTask(S3PathTask): """ Generic task to generate a manifest file that can be used in S3CopyToTable in order to copy multiple files from your - s3 folder into a redshift table at once + s3 folder into a redshift table at once. - For full description on how to use the manifest file see: + For full description on how to use the manifest file see http://docs.aws.amazon.com/redshift/latest/dg/loading-data-files-using-manifest.html Usage: - Requires parameters - path - s3 path to the generated manifest file, including the - name of the generated file - to be copied into a redshift table - folder_paths - s3 paths to the folders containing files you wish to be copied + + * requires parameters + * path - s3 path to the generated manifest file, including the + name of the generated file + to be copied into a redshift table + * folder_paths - s3 paths to the folders containing files you wish to be copied + Output: - generated manifest file + + * generated manifest file """ - # should be over ridden to point to a variety of folders you wish to copy from + # should be over ridden to point to a variety + # of folders you wish to copy from folder_paths = luigi.Parameter() def run(self): @@ -208,3 +304,99 @@ def run(self): target = self.output().open('w') target.write(json.dumps(manifest)) target.close() + + +class KillOpenRedshiftSessions(luigi.Task): + """ + An task for killing any open Redshift sessions + in a given database. This is necessary to prevent open user sessions + with transactions against the table from blocking drop or truncate + table commands. + + Usage: + + Subclass and override the required `host`, `database`, + `user`, and `password` attributes. + """ + + # time in seconds to wait before + # reconnecting to Redshift if our session is killed too. + # 30 seconds is usually fine; 60 is conservative + connection_reset_wait_seconds = luigi.IntParameter(default=60) + + @abc.abstractproperty + def host(self): + return None + + @abc.abstractproperty + def database(self): + return None + + @abc.abstractproperty + def user(self): + return None + + @abc.abstractproperty + def password(self): + return None + + def update_id(self): + """ + This update id will be a unique identifier + for this insert on this table. + """ + return self.task_id + + def output(self): + """ + Returns a RedshiftTarget representing the inserted dataset. + + Normally you don't override this. + """ + # uses class name as a meta-table + return RedshiftTarget( + host=self.host, + database=self.database, + user=self.user, + password=self.password, + table=self.__class__.__name__, + update_id=self.update_id()) + + def run(self): + """ + Kill any open Redshift sessions for the given database. + """ + connection = self.output().connect() + # kill any sessions other than ours and + # internal Redshift sessions (rdsdb) + query = ("select pg_terminate_backend(process) " + "from STV_SESSIONS " + "where db_name=%s " + "and user_name != 'rdsdb' " + "and process != pg_backend_pid()") + cursor = connection.cursor() + logger.info('Killing all open Redshift sessions for database: %s', self.database) + try: + cursor.execute(query, (self.database,)) + cursor.close() + connection.commit() + except psycopg2.DatabaseError, e: + if e.message and 'EOF' in e.message: + # sometimes this operation kills the current session. + # rebuild the connection. Need to pause for 30-60 seconds + # before Redshift will allow us back in. 
+ connection.close() + logger.info('Pausing %s seconds for Redshift to reset connection', self.connection_reset_wait_seconds) + time.sleep(self.connection_reset_wait_seconds) + logger.info('Reconnecting to Redshift') + connection = self.output().connect() + else: + raise + + try: + self.output().touch(connection) + connection.commit() + finally: + connection.close() + + logger.info('Done killing all open Redshift sessions for database: %s', self.database) diff --git a/luigi/contrib/scalding.py b/luigi/contrib/scalding.py new file mode 100644 index 0000000000..024bc394cd --- /dev/null +++ b/luigi/contrib/scalding.py @@ -0,0 +1,288 @@ +import logging +import os +import re +import subprocess + +import luigi.configuration +import luigi.hadoop +import luigi.hadoop_jar +import luigi.hdfs +from luigi import LocalTarget +from luigi.task import flatten + +logger = logging.getLogger('luigi-interface') + +""" +Scalding support for Luigi. + +Example configuration section in client.cfg:: + + [scalding] + # scala home directory, which should include a lib subdir with scala jars. + scala-home: /usr/share/scala + + # scalding home directory, which should include a lib subdir with + # scalding-*-assembly-* jars as built from the official Twitter build script. + scalding-home: /usr/share/scalding + + # provided dependencies, e.g. jars required for compiling but not executing + # scalding jobs. Currently requred jars: + # org.apache.hadoop/hadoop-core/0.20.2 + # org.slf4j/slf4j-log4j12/1.6.6 + # log4j/log4j/1.2.15 + # commons-httpclient/commons-httpclient/3.1 + # commons-cli/commons-cli/1.2 + # org.apache.zookeeper/zookeeper/3.3.4 + scalding-provided: /usr/share/scalding/provided + + # additional jars required. + scalding-libjars: /usr/share/scalding/libjars +""" + + +class ScaldingJobRunner(luigi.hadoop.JobRunner): + """ + JobRunner for `pyscald` commands. Used to run a ScaldingJobTask. 
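The session-killing task above is typically declared as a prerequisite of a truncating or dropping copy task; a hedged, self-contained sketch (connection details are placeholders) could be:

.. code-block:: python

    import luigi
    from luigi.contrib.redshift import KillOpenRedshiftSessions


    class KillAnalyticsSessions(KillOpenRedshiftSessions):
        # placeholder connection details; 30s is usually enough after a reset
        connection_reset_wait_seconds = luigi.IntParameter(default=30)
        host = 'my-cluster.example.redshift.amazonaws.com:5439'
        database = 'analytics'
        user = 'luigi'
        password = 'some_password'

    # a truncating S3CopyToTable subclass would then declare:
    #
    #     def requires(self):
    #         return KillAnalyticsSessions()
    #
    # so that no open user sessions block the truncate before the copy runs.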
+ """ + + def __init__(self): + conf = luigi.configuration.get_config() + + default = os.environ.get('SCALA_HOME', '/usr/share/scala') + self.scala_home = conf.get('scalding', 'scala-home', default) + + default = os.environ.get('SCALDING_HOME', '/usr/share/scalding') + self.scalding_home = conf.get('scalding', 'scalding-home', default) + self.provided_dir = conf.get( + 'scalding', 'scalding-provided', os.path.join(default, 'provided')) + self.libjars_dir = conf.get( + 'scalding', 'scalding-libjars', os.path.join(default, 'libjars')) + + self.tmp_dir = LocalTarget(is_tmp=True) + + def _get_jars(self, path): + return [os.path.join(path, j) for j in os.listdir(path) + if j.endswith('.jar')] + + def get_scala_jars(self, include_compiler=False): + lib_dir = os.path.join(self.scala_home, 'lib') + jars = [os.path.join(lib_dir, 'scala-library.jar')] + + # additional jar for scala 2.10 only + reflect = os.path.join(lib_dir, 'scala-reflect.jar') + if os.path.exists(reflect): + jars.append(reflect) + + if include_compiler: + jars.append(os.path.join(lib_dir, 'scala-compiler.jar')) + + return jars + + def get_scalding_jars(self): + lib_dir = os.path.join(self.scalding_home, 'lib') + return self._get_jars(lib_dir) + + def get_scalding_core(self): + lib_dir = os.path.join(self.scalding_home, 'lib') + for j in os.listdir(lib_dir): + if j.startswith('scalding-core-'): + p = os.path.join(lib_dir, j) + logger.debug('Found scalding-core: %s', p) + return p + raise luigi.hadoop.HadoopJobError('Coudl not find scalding-core.') + + def get_provided_jars(self): + return self._get_jars(self.provided_dir) + + def get_libjars(self): + return self._get_jars(self.libjars_dir) + + def get_tmp_job_jar(self, source): + job_name = os.path.basename(os.path.splitext(source)[0]) + return os.path.join(self.tmp_dir.path, job_name + '.jar') + + def get_build_dir(self, source): + build_dir = os.path.join(self.tmp_dir.path, 'build') + return build_dir + + def get_job_class(self, source): + # find name of the job class + # usually the one that matches file name or last class that extends Job + job_name = os.path.splitext(os.path.basename(source))[0] + package = None + job_class = None + for l in open(source).readlines(): + p = re.search(r'package\s+([^\s\(]+)', l) + if p: + package = p.groups()[0] + p = re.search(r'class\s+([^\s\(]+).*extends\s+.*Job', l) + if p: + job_class = p.groups()[0] + if job_class == job_name: + break + if job_class: + if package: + job_class = package + '.' 
+ job_class + logger.debug('Found scalding job class: %s', job_class) + return job_class + else: + raise luigi.hadoop.HadoopJobError('Coudl not find scalding job class.') + + def build_job_jar(self, job): + job_jar = job.jar() + if job_jar: + if not os.path.exists(job_jar): + logger.error("Can't find jar: {0}, full path {1}".format( + job_jar, os.path.abspath(job_jar))) + raise Exception("job jar does not exist") + if not job.job_class(): + logger.error("Undefined job_class()") + raise Exception("Undefined job_class()") + return job_jar + + job_src = job.source() + if not job_src: + logger.error("Both source() and jar() undefined") + raise Exception("Both source() and jar() undefined") + if not os.path.exists(job_src): + logger.error("Can't find source: {0}, full path {1}".format( + job_src, os.path.abspath(job_src))) + raise Exception("job source does not exist") + + job_src = job.source() + job_jar = self.get_tmp_job_jar(job_src) + + build_dir = self.get_build_dir(job_src) + if not os.path.exists(build_dir): + os.makedirs(build_dir) + + classpath = ':'.join(filter(None, + self.get_scalding_jars() + + self.get_provided_jars() + + self.get_libjars() + + job.extra_jars())) + scala_cp = ':'.join(self.get_scala_jars(include_compiler=True)) + + # compile scala source + arglist = ['java', '-cp', scala_cp, 'scala.tools.nsc.Main', + '-classpath', classpath, + '-d', build_dir, job_src] + logger.info('Compiling scala source: %s', ' '.join(arglist)) + subprocess.check_call(arglist) + + # build job jar file + arglist = ['jar', 'cf', job_jar, '-C', build_dir, '.'] + logger.info('Building job jar: %s', ' '.join(arglist)) + subprocess.check_call(arglist) + return job_jar + + def run_job(self, job): + job_jar = self.build_job_jar(job) + jars = [job_jar] + self.get_libjars() + job.extra_jars() + scalding_core = self.get_scalding_core() + libjars = ','.join(filter(None, jars)) + arglist = luigi.hdfs.load_hadoop_cmd() + ['jar', scalding_core, '-libjars', libjars] + arglist += ['-D%s' % c for c in job.jobconfs()] + + job_class = job.job_class() or self.get_job_class(job.source()) + arglist += [job_class, '--hdfs'] + + # scalding does not parse argument with '=' properly + arglist += ['--name', job.task_id.replace('=', ':')] + + (tmp_files, job_args) = luigi.hadoop_jar.fix_paths(job) + arglist += job_args + + env = os.environ.copy() + jars.append(scalding_core) + hadoop_cp = ':'.join(filter(None, jars)) + env['HADOOP_CLASSPATH'] = hadoop_cp + logger.info("Submitting Hadoop job: HADOOP_CLASSPATH=%s %s", + hadoop_cp, ' '.join(arglist)) + luigi.hadoop.run_and_track_hadoop_job(arglist, env=env) + + for a, b in tmp_files: + a.move(b) + + +class ScaldingJobTask(luigi.hadoop.BaseHadoopJobTask): + """ + A job task for Scalding that define a scala source and (optional) main method. + + requires() should return a dictionary where the keys are Scalding argument + names and values are sub tasks or lists of subtasks. + + For example: + + .. code-block:: python + + {'input1': A, 'input2': C} => --input1 --input2 + {'input1': [A, B], 'input2': [C]} => --input1 --input2 + """ + + def relpath(self, current_file, rel_path): + """ + Compute path given current file and relative path. + """ + script_dir = os.path.dirname(os.path.abspath(current_file)) + rel_path = os.path.abspath(os.path.join(script_dir, rel_path)) + return rel_path + + def source(self): + """ + Path to the scala source for this Scalding Job + + Either one of source() or jar() must be specified. 
+ """ + return None + + def jar(self): + """ + Path to the jar file for this Scalding Job + + Either one of source() or jar() must be specified. + """ + return None + + def extra_jars(self): + """ + Extra jars for building and running this Scalding Job. + """ + return [] + + def job_class(self): + """ + optional main job class for this Scalding Job. + """ + return None + + def job_runner(self): + return ScaldingJobRunner() + + def atomic_output(self): + """ + If True, then rewrite output arguments to be temp locations and + atomically move them into place after the job finishes. + """ + return True + + def requires(self): + return {} + + def job_args(self): + """ + Extra arguments to pass to the Scalding job. + """ + return [] + + def args(self): + """ + Returns an array of args to pass to the job. + """ + arglist = [] + for k, v in self.requires_hadoop().iteritems(): + arglist.append('--' + k) + arglist.extend([t.output().path for t in flatten(v)]) + arglist.extend(['--output', self.output()]) + arglist.extend(self.job_args()) + return arglist diff --git a/luigi/contrib/spark.py b/luigi/contrib/spark.py index d1a173a473..1aec6dd8e3 100644 --- a/luigi/contrib/spark.py +++ b/luigi/contrib/spark.py @@ -3,8 +3,8 @@ import os import random import re -import subprocess import signal +import subprocess import sys import tempfile import time @@ -14,7 +14,6 @@ import luigi.hdfs from luigi import configuration - logger = logging.getLogger('luigi-interface') """ @@ -36,6 +35,7 @@ class SparkRunContext(object): + def __init__(self): self.app_id = None @@ -66,12 +66,21 @@ def __exit__(self, exc_type, exc_val, exc_tb): class SparkJobError(RuntimeError): + def __init__(self, message, out=None, err=None): super(SparkJobError, self).__init__(message, out, err) self.message = message self.out = out self.err = err + def __str__(self): + info = self.message + if self.out: + info += "\nSTDOUT: " + str(self.out) + if self.err: + info += "\nSTDERR: " + str(self.err) + return info + class SparkJob(luigi.Task): spark_workers = None @@ -81,7 +90,9 @@ class SparkJob(luigi.Task): temp_hadoop_output_file = None def requires_local(self): - ''' Default impl - override this method if you need any local input to be accessible in init() ''' + """ + Default impl - override this method if you need any local input to be accessible in init(). + """ return [] def requires_hadoop(self): @@ -158,7 +169,7 @@ def run(self): raise SparkJobError('Spark job failed: see yarn logs for %s' % app_id) else: temp_stderr.seek(0) - errors = temp_stderr.readlines() + errors = "".join(temp_stderr.readlines()) logger.error(errors) raise SparkJobError('Spark job failed', err=errors) @@ -213,7 +224,9 @@ def jar(self): "containing job_class") def dependency_jars(self): - """Override to provide a list of dependency jars.""" + """ + Override to provide a list of dependency jars. + """ return [] def job_class(self): @@ -268,16 +281,15 @@ def run(self): .format(app_id)) elif return_code != 0: temp_stderr.seek(0) - errors = temp_stderr.readlines() + errors = "".join(temp_stderr.readlines()) logger.error(errors) raise SparkJobError('Spark job failed', err=errors) def track_progress(self, proc): """ - The Spark client currently outputs a multiline status to stdout every - second while the application is running. This instead captures status - data and updates a single line of output until the application - finishes. + The Spark client currently outputs a multiline status to stdout every second while the application is running. 
+ + This instead captures status data and updates a single line of output until the application finishes. """ app_id = None app_status = 'N/A' @@ -318,7 +330,6 @@ def track_progress(self, proc): return proc.returncode, final_state, app_id - class PySpark1xJob(Spark1xJob): num_executors = None @@ -330,7 +341,9 @@ def program(self): raise NotImplementedError("subclass should define Spark .py file") def py_files(self): - """Override to provide a list of py files.""" + """ + Override to provide a list of py files. + """ return [] def run(self): @@ -362,6 +375,6 @@ def run(self): .format(app_id)) elif return_code != 0: temp_stderr.seek(0) - errors = temp_stderr.readlines() + errors = "".join(temp_stderr.readlines()) logger.error(errors) raise SparkJobError('Spark job failed', err=errors) diff --git a/luigi/contrib/sparkey.py b/luigi/contrib/sparkey.py index 857fd72247..f3e2746224 100644 --- a/luigi/contrib/sparkey.py +++ b/luigi/contrib/sparkey.py @@ -16,16 +16,19 @@ import luigi + class SparkeyExportTask(luigi.Task): - """ A luigi task that writes to a local sparkey log file. + """ + A luigi task that writes to a local sparkey log file. Subclasses should implement the requires and output methods. The output must be a luigi.LocalTarget. The resulting sparkey log file will contain one entry for every line in the input, mapping from the first value to a tab-separated list of the - rest of the line. To generate a simple key-value index, yield "key", "value" - pairs from the input(s) to this task. + rest of the line. + + To generate a simple key-value index, yield "key", "value" pairs from the input(s) to this task. """ # the separator used to split input lines @@ -55,4 +58,3 @@ def _write_sparkey_file(self): # move finished sparkey file to final destination temp_output.move(outfile.path) - diff --git a/luigi/contrib/sqla.py b/luigi/contrib/sqla.py new file mode 100644 index 0000000000..cee367db65 --- /dev/null +++ b/luigi/contrib/sqla.py @@ -0,0 +1,332 @@ +# Copyright (c) 2015 Gouthaman Balaraman +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +""" +Support for SQLAlchmey. Provides SQLAlchemyTarget for storing in databases +supported by SQLAlchemy. The user would be responsible for installing the +required database driver to connect using SQLAlchemy. + +Minimal example of a job to copy data to database using SQLAlchemy is as shown +below: + +.. 
code-block:: python + + from sqlalchemy import String + import luigi + from luigi.contrib import sqla + + class SQLATask(sqla.CopyToTable): + # columns defines the table schema, with each element corresponding + # to a column in the format (args, kwargs) which will be sent to + # the sqlalchemy.Column(*args, **kwargs) + columns = [ + (["item", String(64)], {"primary_key": True}), + (["property", String(64)], {}) + ] + connection_string = "sqlite://" # in memory SQLite database + table = "item_property" # name of the table to store data + + def rows(self): + for row in [("item1" "property1"), ("item2", "property2")]: + yield row + + if __name__ == '__main__': + task = SQLATask() + luigi.build([task], local_scheduler=True) + + +If the target table where the data needs to be copied already exists, then +the column schema definition can be skipped and instead the reflect flag +can be set as True. Here is a modified version of the above example: + +.. code-block:: python + + from sqlalchemy import String + import luigi + from luigi.contrib import sqla + + class SQLATask(sqla.CopyToTable): + # If database table is already created, then the schema can be loaded + # by setting the reflect flag to True + reflect = True + connection_string = "sqlite://" # in memory SQLite database + table = "item_property" # name of the table to store data + + def rows(self): + for row in [("item1" "property1"), ("item2", "property2")]: + yield row + + if __name__ == '__main__': + task = SQLATask() + luigi.build([task], local_scheduler=True) + + +In the above examples, the data that needs to be copied was directly provided by +overriding the rows method. Alternately, if the data comes from another task, the +modified example would look as shown below: + +.. code-block:: python + + from sqlalchemy import String + import luigi + from luigi.contrib import sqla + from luigi.mock import MockFile + + class BaseTask(luigi.Task): + def output(self): + return MockFile("BaseTask") + + def run(self): + out = self.output().open("w") + TASK_LIST = ["item%d\\tproperty%d\\n" % (i, i) for i in range(10)] + for task in TASK_LIST: + out.write(task) + out.close() + + class SQLATask(sqla.CopyToTable): + # columns defines the table schema, with each element corresponding + # to a column in the format (args, kwargs) which will be sent to + # the sqlalchemy.Column(*args, **kwargs) + columns = [ + (["item", String(64)], {"primary_key": True}), + (["property", String(64)], {}) + ] + connection_string = "sqlite://" # in memory SQLite database + table = "item_property" # name of the table to store data + + def requires(self): + return BaseTask() + + if __name__ == '__main__': + task1, task2 = SQLATask(), BaseTask() + luigi.build([task1, task2], local_scheduler=True) + + +In the above example, the output from `BaseTask` is copied into the +database. Here we did not have to implement the `rows` method because +by default `rows` implementation assumes every line is a row with +column values separated by a tab. One can define `column_separator` +option for the task if the values are say comma separated instead of +tab separated. + +The other option to `sqla.CopyToTable` that can be of help with performance aspect is the +`chunk_size`. The default is 5000. This is the number of rows that will be inserted in +a transaction at a time. Depending on the size of the inserts, this value can be tuned +for performance. 
+ +Author: Gouthaman Balaraman +Date: 01/02/2015 +""" + + +import abc +import datetime +import itertools +import logging + +import luigi +import sqlalchemy + +logger = logging.getLogger('luigi-interface') + + +class SQLAlchemyTarget(luigi.Target): + """ + Database target using SQLAlchemy. + + This will rarely have to be directly instantiated by the user. + + Typical usage would be to override `luigi.contrib.sqla.CopyToTable` class + to create a task to write to the database. + """ + marker_table = None + + def __init__(self, connection_string, target_table, update_id, echo=False): + """ + Constructor for the SQLAlchemyTarget. + + :param connection_string: (str) SQLAlchemy connection string + :param target_table: (str) The table name for the data + :param update_id: (str) An identifier for this data set + :param echo: (bool) Flag to setup SQLAlchemy logging + :return: + """ + self.target_table = target_table + self.update_id = update_id + self.engine = sqlalchemy.create_engine(connection_string, echo=echo) + self.marker_table_bound = None + + def touch(self): + """ + Mark this update as complete. + """ + if self.marker_table_bound is None: + self.create_marker_table() + + table = self.marker_table_bound + with self.engine.begin() as conn: + id_exists = self.exists() + if not id_exists: + ins = table.insert().values(update_id=self.update_id, target_table=self.target_table) + else: + ins = table.update().values(update_id=self.update_id, target_table=self.target_table, + inserted=datetime.datetime.now()) + conn.execute(ins) + assert self.exists() + + def exists(self): + row = None + if self.marker_table_bound is None: + self.create_marker_table() + with self.engine.begin() as conn: + table = self.marker_table_bound + s = sqlalchemy.select([table]).where(table.c.update_id == self.update_id).limit(1) + row = conn.execute(s).fetchone() + return row is not None + + def create_marker_table(self): + """ + Create marker table if it doesn't exist. + + Using a separate connection since the transaction might have to be reset. + """ + if self.marker_table is None: + self.marker_table = luigi.configuration.get_config().get('sqlalchemy', 'marker-table', 'table_updates') + + with self.engine.begin() as con: + metadata = sqlalchemy.MetaData() + if not con.dialect.has_table(con, self.marker_table): + self.marker_table_bound = sqlalchemy.Table( + self.marker_table, metadata, + sqlalchemy.Column("update_id", sqlalchemy.String(128), primary_key=True), + sqlalchemy.Column("target_table", sqlalchemy.String(128)), + sqlalchemy.Column("inserted", sqlalchemy.DateTime, default=datetime.datetime.now())) + metadata.create_all(self.engine) + else: + metadata.reflect(bind=self.engine) + self.marker_table_bound = metadata.tables[self.marker_table] + + def open(self, mode): + raise NotImplementedError("Cannot open() SQLAlchemyTarget") + + +class CopyToTable(luigi.Task): + """ + An abstract task for inserting a data set into SQLAlchemy RDBMS + + Usage: + + * subclass and override the required `connection_string`, `table` and `columns` attributes. + """ + echo = False + + @abc.abstractmethod + def connection_string(self): + return None + + @abc.abstractproperty + def table(self): + return None + + # specify the columns that define the schema. The format for the columns is a list + # of tuples. 
For example : + # columns = [ + # (["id", sqlalchemy.Integer], dict(primary_key=True)), + # (["name", sqlalchemy.String(64)], {}), + # (["value", sqlalchemy.String(64)], {}) + # ] + # The tuple (args_list, kwargs_dict) here is the args and kwargs + # that need to be passed to sqlalchemy.Column(*args, **kwargs). + # If the tables have already been setup by another process, then you can + # completely ignore the columns. Instead set the reflect value to True below + columns = [] + + # options + column_separator = "\t" # how columns are separated in the file copied into postgres + chunk_size = 5000 # default chunk size for insert + reflect = False # Set this to true only if the table has already been created by alternate means + + def create_table(self, engine): + """ + Override to provide code for creating the target table. + + By default it will be created using types specified in columns. + If the table exists, then it binds to the existing table. + + If overridden, use the provided connection object for setting up the table in order to + create the table and insert data using the same transaction. + """ + def construct_sqla_columns(columns): + retval = [sqlalchemy.Column(*c[0], **c[1]) for c in columns] + return retval + + needs_setup = (len(self.columns) == 0) or (False in [len(c) == 2 for c in self.columns]) if not self.reflect else False + if needs_setup: + # only names of columns specified, no types + raise NotImplementedError("create_table() not implemented for %r and columns types not specified" % self.table) + else: + # if columns is specified as (name, type) tuples + with engine.begin() as con: + metadata = sqlalchemy.MetaData() + try: + if not con.dialect.has_table(con, self.table): + sqla_columns = construct_sqla_columns(self.columns) + self.table_bound = sqlalchemy.Table(self.table, metadata, *sqla_columns) + metadata.create_all(engine) + else: + metadata.reflect(bind=engine) + self.table_bound = metadata.tables[self.table] + except Exception as e: + logger.exception(self.table + str(e)) + + def update_id(self): + """ + This update id will be a unique identifier for this insert on this table. + """ + return self.task_id + + def output(self): + return SQLAlchemyTarget( + connection_string=self.connection_string, + target_table=self.table, + update_id=self.update_id(), + echo=self.echo + ) + + def rows(self): + """ + Return/yield tuples or lists corresponding to each row to be inserted. + + This method can be overridden for custom file types or formats. + """ + with self.input().open('r') as fobj: + for line in fobj: + yield line.strip("\n").split(self.column_separator) + + def run(self): + logger.info("Running task copy to table for update id %s for table %s" % (self.update_id(), self.table)) + output = self.output() + self.create_table(output.engine) + with output.engine.begin() as conn: + rows = iter(self.rows()) + ins_rows = [dict(zip((c.key for c in self.table_bound.c), row)) + for row in itertools.islice(rows, self.chunk_size)] + while ins_rows: + ins = self.table_bound.insert() + conn.execute(ins, ins_rows) + ins_rows = [dict(zip((c.key for c in self.table_bound.c), row)) + for row in itertools.islice(rows, self.chunk_size)] + logger.info("Finished inserting %d rows into SQLAlchemy target" % len(ins_rows)) + output.touch() + logger.info("Finished inserting rows into SQLAlchemy target") diff --git a/luigi/contrib/ssh.py b/luigi/contrib/ssh.py index 3a25dc7ac9..7c49d107c4 100644 --- a/luigi/contrib/ssh.py +++ b/luigi/contrib/ssh.py @@ -13,11 +13,14 @@ # the License. 
""" -Light-weight remote execution library and utilities +Light-weight remote execution library and utilities. -There are some examples in the unittest, but I added another more luigi-specific in the examples directory (examples/ssh_remote_execution.py +There are some examples in the unittest, but +I added another more luigi-specific in the examples directory (examples/ssh_remote_execution.py -contrib.ssh.RemoteContext is meant to provide functionality similar to that of the standard library subprocess module, but where the commands executed are run on a remote machine instead, without the user having to think about prefixing everything with "ssh" and credentials etc. +contrib.ssh.RemoteContext is meant to provide functionality similar to that of the standard library subprocess module, +but where the commands executed are run on a remote machine instead, +without the user having to think about prefixing everything with "ssh" and credentials etc. Using this mini library (which is just a convenience wrapper for subprocess), RemoteTarget is created to let you stream data from a remotely stored file using @@ -30,17 +33,18 @@ This can be super convenient when you want secure communication using a non-secure protocol or circumvent firewalls (as long as they are open for ssh traffic). """ +import contextlib import os import random +import subprocess import luigi -import luigi.target import luigi.format -import subprocess -import contextlib +import luigi.target class RemoteContext(object): + def __init__(self, host, username=None, key_file=None, connect_timeout=None): self.host = host self.username = username @@ -77,14 +81,17 @@ def _prepare_cmd(self, cmd): return connection_cmd + cmd def Popen(self, cmd, **kwargs): - """ Remote Popen """ + """ + Remote Popen. + """ prefixed_cmd = self._prepare_cmd(cmd) return subprocess.Popen(prefixed_cmd, **kwargs) def check_output(self, cmd): - """ Execute a shell command remotely and return the output + """ + Execute a shell command remotely and return the output. - Simplified version of Popen when you only want the output as a string and detect any errors + Simplified version of Popen when you only want the output as a string and detect any errors. """ p = self.Popen(cmd, stdout=subprocess.PIPE) output, _ = p.communicate() @@ -94,7 +101,8 @@ def check_output(self, cmd): @contextlib.contextmanager def tunnel(self, local_port, remote_port=None, remote_host="localhost"): - """ Open a tunnel between localhost:local_port and remote_host:remote_port via the host specified by this context + """ + Open a tunnel between localhost:local_port and remote_host:remote_port via the host specified by this context. Remember to close() the returned "tunnel" object in order to clean up after yourself when you are done with the tunnel. @@ -115,14 +123,17 @@ def tunnel(self, local_port, remote_port=None, remote_host="localhost"): class RemoteFileSystem(luigi.target.FileSystem): + def __init__(self, host, username=None, key_file=None): self.remote_context = RemoteContext(host, username, key_file) def exists(self, path): - """ Return `True` if file or directory at `path` exist, False otherwise """ + """ + Return `True` if file or directory at `path` exist, False otherwise. 
+ """ try: self.remote_context.check_output(["test", "-e", path]) - except subprocess.CalledProcessError, e: + except subprocess.CalledProcessError as e: if e.returncode == 1: return False else: @@ -130,7 +141,9 @@ def exists(self, path): return True def remove(self, path, recursive=True): - """ Remove file or directory at location `path` """ + """ + Remove file or directory at location `path`. + """ if recursive: cmd = ["rm", "-r", path] else: @@ -172,6 +185,7 @@ def get(self, path, local_path): class AtomicRemoteFileWriter(luigi.format.OutputPipeProcessWrapper): + def __init__(self, fs, path): self._fs = fs self.path = path @@ -206,11 +220,13 @@ def fs(self): class RemoteTarget(luigi.target.FileSystemTarget): """ - Target used for reading from remote files. The target is implemented using - ssh commands streaming data over the network. + Target used for reading from remote files. + + The target is implemented using ssh commands streaming data over the network. """ + def __init__(self, path, host, format=None, username=None, key_file=None): - self.path = path + super(RemoteTarget, self).__init__(path) self.format = format self._fs = RemoteFileSystem(host, username, key_file) diff --git a/luigi/contrib/target.py b/luigi/contrib/target.py index 3d2bbaeadf..32b1028df1 100644 --- a/luigi/contrib/target.py +++ b/luigi/contrib/target.py @@ -1,12 +1,16 @@ -import luigi.target import logging import types + +import luigi.target + logger = logging.getLogger('luigi-interface') -class CascadingClient(): + +class CascadingClient(object): """ - A FilesystemClient that will cascade failing function calls through a list - of clients. Which clients are used are specified at time of construction. + A FilesystemClient that will cascade failing function calls through a list of clients. + + Which clients are used are specified at time of construction. """ # This constant member is supposed to include all methods, feel free to add @@ -14,10 +18,14 @@ class CascadingClient(): # created, pass the kwarg to the constructor. ALL_METHOD_NAMES = ['exists', 'rename', 'remove', 'chmod', 'chown', 'count', 'copy', 'get', 'put', 'mkdir', 'listdir', - 'isdir'] + 'isdir', + 'rename_dont_move', + ] - def __init__(self, clients, method_names=ALL_METHOD_NAMES): + def __init__(self, clients, method_names=None): self.clients = clients + if method_names is None: + method_names = self.ALL_METHOD_NAMES for method_name in method_names: new_method = self._make_method(method_name) @@ -47,4 +55,4 @@ def _chained_call(self, method_name, *args, **kwargs): logger.exception( 'The {0} failed to {1}, using fallback class {2}' .format(client.__class__.__name__, method_name, - self.clients[i+1].__class__.__name__)) + self.clients[i + 1].__class__.__name__)) diff --git a/luigi/contrib/webhdfs.py b/luigi/contrib/webhdfs.py index d50bde08dc..ac1e242cd4 100644 --- a/luigi/contrib/webhdfs.py +++ b/luigi/contrib/webhdfs.py @@ -4,9 +4,9 @@ """ from __future__ import absolute_import +import logging import os import random -import logging import tempfile from luigi import configuration @@ -39,6 +39,7 @@ def open(self, mode='r'): class ReadableWebHdfsFile(object): + def __init__(self, path, client): self.path = path self.client = client @@ -77,6 +78,7 @@ class AtomicWebHdfsFile(file): """ An Hdfs file that writes to a temp file and put to WebHdfs on close. 
""" + def __init__(self, path, client): unique_name = 'luigi-webhdfs-tmp-%09d' % random.randrange(0, 1e10) self.tmp_path = os.path.join(tempfile.gettempdir(), unique_name) @@ -93,13 +95,17 @@ def __enter__(self): return self def __exit__(self, exc_type, exc, traceback): - """Close/commit the file if there are no exception""" + """ + Close/commit the file if there are no exception. + """ if exc_type: return return file.__exit__(self, exc_type, exc, traceback) def __del__(self): - """Remove the temporary directory""" + """ + Remove the temporary directory. + """ if os.path.exists(self.tmp_path): os.remove(self.tmp_path) @@ -126,11 +132,13 @@ def walk(self, path, depth=1): return self.webhdfs.walk(path, depth=depth) def exists(self, path): - """Returns true if the path exists and false otherwise""" + """ + Returns true if the path exists and false otherwise. + """ try: self.webhdfs.status(path) return True - except webhdfs.util.HdfsError, e: + except webhdfs.util.HdfsError as e: if str(e).startswith('File does not exist: '): return False else: diff --git a/luigi/date_interval.py b/luigi/date_interval.py index f4daafd5de..d166ef5bcb 100644 --- a/luigi/date_interval.py +++ b/luigi/date_interval.py @@ -12,11 +12,12 @@ # License for the specific language governing permissions and limitations under # the License. -import re import datetime +import re class DateInterval(object): + def __init__(self, date_a, date_b): # Represents all date d such that date_a <= d < date_b self.date_a = date_a @@ -52,11 +53,11 @@ def to_string(self): raise NotImplementedError @classmethod - def from_date(self, d): + def from_date(cls, d): raise NotImplementedError @classmethod - def parse(self, s): + def parse(cls, s): raise NotImplementedError def __contains__(self, date): @@ -70,7 +71,7 @@ def __hash__(self): return hash(repr(self)) def __cmp__(self, other): - if type(self) != type(other): + if not isinstance(self, type(other)): # doing this because it's not well defined if eg. 2012-01-01-2013-01-01 == 2012 raise TypeError('Date interval type mismatch') return cmp((self.date_a, self.date_b), (other.date_a, other.date_b)) @@ -86,6 +87,7 @@ def __ne__(self, other): class Date(DateInterval): + def __init__(self, y, m, d): a = datetime.date(y, m, d) b = datetime.date(y, m, d) + datetime.timedelta(1) @@ -95,16 +97,17 @@ def to_string(self): return self.date_a.strftime('%Y-%m-%d') @classmethod - def from_date(self, d): + def from_date(cls, d): return Date(d.year, d.month, d.day) @classmethod - def parse(self, s): + def parse(cls, s): if re.match(r'\d\d\d\d\-\d\d\-\d\d$', s): return Date(*map(int, s.split('-'))) class Week(DateInterval): + def __init__(self, y, w): # Python datetime does not have a method to convert from ISO weeks! 
for d in xrange(-10, 370): @@ -121,17 +124,18 @@ def to_string(self): return '%d-W%02d' % self.date_a.isocalendar()[:2] @classmethod - def from_date(self, d): + def from_date(cls, d): return Week(*d.isocalendar()[:2]) @classmethod - def parse(self, s): + def parse(cls, s): if re.match(r'\d\d\d\d\-W\d\d$', s): y, w = map(int, s.split('-W')) return Week(y, w) class Month(DateInterval): + def __init__(self, y, m): date_a = datetime.date(y, m, 1) date_b = datetime.date(y + m / 12, 1 + m % 12, 1) @@ -141,17 +145,18 @@ def to_string(self): return self.date_a.strftime('%Y-%m') @classmethod - def from_date(self, d): + def from_date(cls, d): return Month(d.year, d.month) @classmethod - def parse(self, s): + def parse(cls, s): if re.match(r'\d\d\d\d\-\d\d$', s): y, m = map(int, s.split('-')) return Month(y, m) class Year(DateInterval): + def __init__(self, y): date_a = datetime.date(y, 1, 1) date_b = datetime.date(y + 1, 1, 1) @@ -161,21 +166,22 @@ def to_string(self): return self.date_a.strftime('%Y') @classmethod - def from_date(self, d): + def from_date(cls, d): return Year(d.year) @classmethod - def parse(self, s): + def parse(cls, s): if re.match(r'\d\d\d\d$', s): return Year(int(s)) class Custom(DateInterval): + def to_string(self): return '-'.join([d.strftime('%Y-%m-%d') for d in (self.date_a, self.date_b)]) @classmethod - def parse(self, s): + def parse(cls, s): if re.match('\d\d\d\d\-\d\d\-\d\d\-\d\d\d\d\-\d\d\-\d\d$', s): # Actually the ISO 8601 specifies / as the time interval format # Not sure if this goes for date intervals as well. In any case slashes will diff --git a/luigi/db_task_history.py b/luigi/db_task_history.py index 17279158ad..8294c5e498 100644 --- a/luigi/db_task_history.py +++ b/luigi/db_task_history.py @@ -12,26 +12,27 @@ # License for the specific language governing permissions and limitations under # the License. -import task_history -import configuration import datetime import logging - from contextlib import contextmanager -from task_status import PENDING, FAILED, DONE, RUNNING -from sqlalchemy.orm.collections import attribute_mapped_collection -from sqlalchemy import Column, Integer, String, ForeignKey, TIMESTAMP, create_engine -from sqlalchemy.ext.declarative import declarative_base -from sqlalchemy.orm import sessionmaker, relationship +import configuration +import sqlalchemy +import sqlalchemy.ext.declarative +import sqlalchemy.orm +import sqlalchemy.orm.collections +import task_history +from task_status import DONE, FAILED, PENDING, RUNNING -Base = declarative_base() +Base = sqlalchemy.ext.declarative.declarative_base() logger = logging.getLogger('luigi-interface') class DbTaskHistory(task_history.TaskHistory): - """ Task History that writes to a database using sqlalchemy. Also has methods for useful db queries + """ + Task History that writes to a database using sqlalchemy. + Also has methods for useful db queries. 
""" @contextmanager def _session(self, session=None): @@ -50,8 +51,8 @@ def _session(self, session=None): def __init__(self): config = configuration.get_config() connection_string = config.get('task_history', 'db_connection') - self.engine = create_engine(connection_string) - self.session_factory = sessionmaker(bind=self.engine, expire_on_commit=False) + self.engine = sqlalchemy.create_engine(connection_string) + self.session_factory = sqlalchemy.orm.sessionmaker(bind=self.engine, expire_on_commit=False) Base.metadata.create_all(self.engine) self.tasks = {} # task_id -> TaskRecord @@ -101,8 +102,9 @@ def _find_or_create_task(self, task): task.record_id = task_record.id def find_all_by_parameters(self, task_name, session=None, **task_params): - ''' Find tasks with the given task_name and the same parameters as the kwargs - ''' + """ + Find tasks with the given task_name and the same parameters as the kwargs. + """ with self._session(session) as session: tasks = session.query(TaskRecord).join(TaskEvent).filter(TaskRecord.name == task_name).order_by(TaskEvent.ts).all() for task in tasks: @@ -110,13 +112,15 @@ def find_all_by_parameters(self, task_name, session=None, **task_params): yield task def find_all_by_name(self, task_name, session=None): - ''' Find all tasks with the given task_name - ''' + """ + Find all tasks with the given task_name. + """ return self.find_all_by_parameters(task_name, session) def find_latest_runs(self, session=None): - ''' Return tasks that have been updated in the past 24 hours. - ''' + """ + Return tasks that have been updated in the past 24 hours. + """ with self._session(session) as session: yesterday = datetime.datetime.now() - datetime.timedelta(days=1) return session.query(TaskRecord).\ @@ -127,48 +131,58 @@ def find_latest_runs(self, session=None): all() def find_task_by_id(self, id, session=None): - ''' Find task with the given record ID - ''' + """ + Find task with the given record ID. + """ with self._session(session) as session: return session.query(TaskRecord).get(id) class TaskParameter(Base): - """ Table to track luigi.Parameter()s of a Task + """ + Table to track luigi.Parameter()s of a Task. """ __tablename__ = 'task_parameters' - task_id = Column(Integer, ForeignKey('tasks.id'), primary_key=True) - name = Column(String(128), primary_key=True) - value = Column(String(256)) + task_id = sqlalchemy.Column(sqlalchemy.Integer, sqlalchemy.ForeignKey('tasks.id'), primary_key=True) + name = sqlalchemy.Column(sqlalchemy.String(128), primary_key=True) + value = sqlalchemy.Column(sqlalchemy.String(256)) def __repr__(self): return "TaskParameter(task_id=%d, name=%s, value=%s)" % (self.task_id, self.name, self.value) class TaskEvent(Base): - """ Table to track when a task is scheduled, starts, finishes, and fails + """ + Table to track when a task is scheduled, starts, finishes, and fails. """ __tablename__ = 'task_events' - id = Column(Integer, primary_key=True) - task_id = Column(Integer, ForeignKey('tasks.id')) - event_name = Column(String(20)) - ts = Column(TIMESTAMP, index=True) + id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True) + task_id = sqlalchemy.Column(sqlalchemy.Integer, sqlalchemy.ForeignKey('tasks.id')) + event_name = sqlalchemy.Column(sqlalchemy.String(20)) + ts = sqlalchemy.Column(sqlalchemy.TIMESTAMP, index=True) def __repr__(self): return "TaskEvent(task_id=%s, event_name=%s, ts=%s" % (self.task_id, self.event_name, self.ts) class TaskRecord(Base): - """ Base table to track information about a luigi.Task. 
References to other tables are available through - task.events, task.parameters, etc. + """ + Base table to track information about a luigi.Task. + + References to other tables are available through task.events, task.parameters, etc. """ __tablename__ = 'tasks' - id = Column(Integer, primary_key=True) - name = Column(String(128), index=True) - host = Column(String(128)) - parameters = relationship('TaskParameter', collection_class=attribute_mapped_collection('name'), - cascade="all, delete-orphan") - events = relationship("TaskEvent", order_by=lambda: TaskEvent.ts.desc(), backref="task") + id = sqlalchemy.Column(sqlalchemy.Integer, primary_key=True) + name = sqlalchemy.Column(sqlalchemy.String(128), index=True) + host = sqlalchemy.Column(sqlalchemy.String(128)) + parameters = sqlalchemy.orm.relationship( + 'TaskParameter', + collection_class=sqlalchemy.orm.collections.attribute_mapped_collection('name'), + cascade="all, delete-orphan") + events = sqlalchemy.orm.relationship( + 'TaskEvent', + order_by=lambda: TaskEvent.ts.desc(), + backref='task') def __repr__(self): return "TaskRecord(name=%s, host=%s)" % (self.name, self.host) diff --git a/luigi/deprecate_kwarg.py b/luigi/deprecate_kwarg.py new file mode 100644 index 0000000000..52bb301ae9 --- /dev/null +++ b/luigi/deprecate_kwarg.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +import warnings + + +def deprecate_kwarg(old_name, new_name, kw_value): + """ + Rename keyword arguments, but keep backwards compatibility. + + Usage: + + .. code-block: python + + >>> @deprecate_kwarg('old', 'new', 'defval') + ... def some_func(old='defval'): + ... print(old) + ... + >>> some_func(new='yay') + yay + >>> some_func(old='yaay') + yaay + >>> some_func() + defval + + """ + def real_decorator(function): + def new_function(*args, **kwargs): + value = kw_value + if old_name in kwargs: + warnings.warn('Keyword argument {0} is deprecated, use {1}' + .format(old_name, new_name)) + value = kwargs[old_name] + if new_name in kwargs: + value = kwargs[new_name] + del kwargs[new_name] + kwargs[old_name] = value + return function(*args, **kwargs) + return new_function + return real_decorator diff --git a/luigi/event.py b/luigi/event.py index cbb1ed40f2..dd2e15e28b 100644 --- a/luigi/event.py +++ b/luigi/event.py @@ -12,7 +12,8 @@ # License for the specific language governing permissions and limitations under # the License. -class Event: + +class Event(object): # TODO nice descriptive subclasses of Event instead of strings? pass their instances to the callback instead of an undocumented arg list? 
DEPENDENCY_DISCOVERED = "event.core.dependency.discovered" # triggered for every (task, upstream task) pair discovered in a jobflow DEPENDENCY_MISSING = "event.core.dependency.missing" @@ -22,5 +23,3 @@ class Event: FAILURE = "event.core.failure" SUCCESS = "event.core.success" PROCESSING_TIME = "event.core.processing_time" - - diff --git a/luigi/file.py b/luigi/file.py index 5b57826f9f..0d22e42cfc 100644 --- a/luigi/file.py +++ b/luigi/file.py @@ -14,16 +14,18 @@ import os import random -import tempfile import shutil +import tempfile + import luigi.util -from target import FileSystem, FileSystemTarget from luigi.format import FileWrapper +from target import FileSystem, FileSystemTarget class atomic_file(file): # Simple class that writes to a temp file and moves it on close() # Also cleans up the temp file if close is not invoked + def __init__(self, path): self.__tmp_path = path + '-luigi-tmp-%09d' % random.randrange(0, 1e10) self.path = path @@ -49,14 +51,16 @@ def __exit__(self, exc_type, exc, traceback): class LocalFileSystem(FileSystem): - """ Wrapper for access to file system operations + """ + Wrapper for access to file system operations. - Work in progress - add things as needed + Work in progress - add things as needed. """ + def exists(self, path): return os.path.exists(path) - def mkdir(self, path): + def mkdir(self, path, parents=True, raise_if_exists=False): os.makedirs(path) def isdir(self, path): @@ -81,14 +85,18 @@ def __init__(self, path=None, format=None, is_tmp=False): self.format = format self.is_tmp = is_tmp + def makedirs(self): + """ + Create all parent folders if they do not exist. + """ + normpath = os.path.normpath(self.path) + parentfolder = os.path.dirname(normpath) + if parentfolder and not os.path.exists(parentfolder): + os.makedirs(parentfolder) + def open(self, mode='r'): if mode == 'w': - # Create folder if it does not exist - normpath = os.path.normpath(self.path) - parentfolder = os.path.dirname(normpath) - if parentfolder and not os.path.exists(parentfolder): - os.makedirs(parentfolder) - + self.makedirs() if self.format: return self.format.pipe_writer(atomic_file(self.path)) else: @@ -122,7 +130,7 @@ def copy(self, new_path, fail_if_exists=False): if fail_if_exists and os.path.exists(new_path): raise RuntimeError('Destination exists: %s' % new_path) tmp = File(new_path + '-luigi-tmp-%09d' % random.randrange(0, 1e10), is_tmp=True) - tmp.open('w') + tmp.makedirs() shutil.copy(self.path, tmp.fn) tmp.move(new_path) diff --git a/luigi/format.py b/luigi/format.py index e12a64ef52..522e9b7174 100644 --- a/luigi/format.py +++ b/luigi/format.py @@ -12,12 +12,13 @@ # License for the specific language governing permissions and limitations under # the License. -import subprocess import signal +import subprocess class FileWrapper(object): - """Wrap `file` in a "real" so stuff can be added to it after creation + """ + Wrap `file` in a "real" so stuff can be added to it after creation. """ def __init__(self, file_object): @@ -42,12 +43,15 @@ def __iter__(self): class InputPipeProcessWrapper(object): + def __init__(self, command, input_pipe=None): - ''' - @param command a subprocess.Popen instance with stdin=input_pipe and - stdout=subprocess.PIPE. Alternatively, just its args argument as a - convenience. - ''' + """ + Initializes a InputPipeProcessWrapper instance. + + :param command: a subprocess.Popen instance with stdin=input_pipe and + stdout=subprocess.PIPE. + Alternatively, just its args argument as a convenience. 
+ """ self._command = command self._input_pipe = input_pipe self._process = command if isinstance(command, subprocess.Popen) else self.create_subprocess(command) @@ -94,7 +98,9 @@ def __enter__(self): return self def _abort(self): - "Call _finish, but eat the exception (if any)." + """ + Call _finish, but eat the exception (if any). + """ try: self._finish() except KeyboardInterrupt: @@ -144,7 +150,9 @@ def writeLine(self, line): self.write(line + '\n') def _finish(self): - """ Closes and waits for subprocess to exit """ + """ + Closes and waits for subprocess to exit. + """ if self._process.returncode is None: self._process.stdin.flush() self._process.stdin.close() @@ -182,7 +190,9 @@ def __getattr__(self, name): class Format(object): - """ Interface for format specifications """ + """ + Interface for format specifications. + """ # TODO Move this to somewhere else? @classmethod @@ -204,6 +214,7 @@ def pipe_writer(cls, output_pipe): class Gzip(Format): + @classmethod def pipe_reader(cls, input_pipe): return InputPipeProcessWrapper(['gunzip'], input_pipe) @@ -214,6 +225,7 @@ def pipe_writer(cls, output_pipe): class Bzip2(Format): + @classmethod def pipe_reader(cls, input_pipe): return InputPipeProcessWrapper(['bzcat'], input_pipe) @@ -221,4 +233,3 @@ def pipe_reader(cls, input_pipe): @classmethod def pipe_writer(cls, output_pipe): return OutputPipeProcessWrapper(['bzip2'], output_pipe) - diff --git a/luigi/hadoop.py b/luigi/hadoop.py index e3caea97a0..3bd80b654b 100644 --- a/luigi/hadoop.py +++ b/luigi/hadoop.py @@ -12,28 +12,30 @@ # License for the specific language governing permissions and limitations under # the License. -import random -import sys -import os -import datetime -import subprocess -import tempfile -from itertools import groupby -import pickle +import abc import binascii +import datetime +import glob +import json import logging -import StringIO +import os +import pickle +import random import re import shutil import signal +import StringIO +import subprocess +import sys +import tempfile +import warnings from hashlib import md5 +from itertools import groupby + +import configuration import luigi import luigi.hdfs -import configuration -import warnings import mrrunner -import json -import glob logger = logging.getLogger('luigi-interface') @@ -41,16 +43,19 @@ def attach(*packages): - """ Attach a python package to hadoop map reduce tarballs to make those packages available on the hadoop cluster""" + """ + Attach a python package to hadoop map reduce tarballs to make those packages available + on the hadoop cluster. 
+ """ _attached_packages.extend(packages) -def dereference(file): - if os.path.islink(file): - #by joining with the dirname we are certain to get the absolute path - return dereference(os.path.join(os.path.dirname(file), os.readlink(file))) +def dereference(f): + if os.path.islink(f): + # by joining with the dirname we are certain to get the absolute path + return dereference(os.path.join(os.path.dirname(f), os.readlink(f))) else: - return file + return f def get_extra_files(extra_files): @@ -66,8 +71,8 @@ def get_extra_files(extra_files): if os.path.isdir(src): src_prefix = os.path.join(src, '') for base, dirs, files in os.walk(src): - for file in files: - f_src = os.path.join(base, file) + for f in files: + f_src = os.path.join(base, f) f_src_stripped = f_src[len(src_prefix):] f_dst = os.path.join(dst, f_src_stripped) result.append((f_src, f_dst)) @@ -78,7 +83,9 @@ def get_extra_files(extra_files): def create_packages_archive(packages, filename): - """Create a tar archive which will contain the files for the packages listed in packages. """ + """ + Create a tar archive which will contain the files for the packages listed in packages. + """ import tarfile tar = tarfile.open(filename, "w") @@ -149,11 +156,15 @@ def add_files_for_package(sub_package_path, root_package_path, root_package_name def flatten(sequence): - """A simple generator which flattens a sequence. + """ + A simple generator which flattens a sequence. + + Only one level is flattened. + + .. code-block:: python - Only one level is flattned. + (1, (2, 3), 4) -> (1, 2, 3, 4) - (1, (2, 3), 4) -> (1, 2, 3, 4) """ for item in sequence: if hasattr(item, "__iter__"): @@ -164,6 +175,7 @@ def flatten(sequence): class HadoopRunContext(object): + def __init__(self): self.job_id = None @@ -187,6 +199,7 @@ def __exit__(self, exc_type, exc_val, exc_tb): class HadoopJobError(RuntimeError): + def __init__(self, message, out=None, err=None): super(HadoopJobError, self).__init__(message, out, err) self.message = message @@ -195,17 +208,26 @@ def __init__(self, message, out=None, err=None): def run_and_track_hadoop_job(arglist, tracking_url_callback=None, env=None): - ''' Runs the job by invoking the command from the given arglist. Finds tracking urls from the output and attempts to fetch - errors using those urls if the job fails. Throws HadoopJobError with information about the error (including stdout and stderr - from the process) on failure and returns normally otherwise. - ''' + """ + Runs the job by invoking the command from the given arglist. + Finds tracking urls from the output and attempts to fetch errors using those urls if the job fails. + Throws HadoopJobError with information about the error + (including stdout and stderr from the process) + on failure and returns normally otherwise. + + :param arglist: + :param tracking_url_callback: + :param env: + :return: + """ logger.info('%s', ' '.join(arglist)) def write_luigi_history(arglist, history): - ''' + """ Writes history to a file in the job's output directory in JSON format. - Currently just for tracking the job ID in a configuration where no history is stored in the output directory by Hadoop. - ''' + Currently just for tracking the job ID in a configuration where + no history is stored in the output directory by Hadoop. 
+ """ history_filename = configuration.get_config().get('core', 'history-filename', '') if history_filename and '-output' in arglist: output_dir = arglist[arglist.index('-output') + 1] @@ -263,7 +285,7 @@ def track_process(arglist, tracking_url_callback, env=None): try: task_failures = fetch_task_failures(tracking_url) - except Exception, e: + except Exception as e: raise HadoopJobError(message + 'Additionally, an error occurred when fetching data from %s: %s' % (tracking_url, e), out, err) @@ -279,14 +301,16 @@ def track_process(arglist, tracking_url_callback, env=None): def fetch_task_failures(tracking_url): - ''' Uses mechanize to fetch the actual task logs from the task tracker. + """ + Uses mechanize to fetch the actual task logs from the task tracker. - This is highly opportunistic, and we might not succeed. So we set a low timeout and hope it works. + This is highly opportunistic, and we might not succeed. + So we set a low timeout and hope it works. If it does not, it's not the end of the world. TODO: Yarn has a REST API that we should probably use instead: - http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/MapredAppMasterRest.html - ''' + http://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/WebServicesIntro.html + """ import mechanize timeout = 3.0 failures_url = tracking_url.replace('jobdetails.jsp', 'jobfailures.jsp') + '&cause=failed' @@ -303,7 +327,7 @@ def fetch_task_failures(tracking_url): try: r = b2.open(task_url, timeout=timeout) data = r.read() - except Exception, e: + except Exception as e: logger.debug('Error fetching data from %s: %s', task_url, e) continue # Try to get the hex-encoded traceback back from the output @@ -319,17 +343,21 @@ class JobRunner(object): class HadoopJobRunner(JobRunner): - ''' Takes care of uploading & executing a Hadoop job using Hadoop streaming + """ + Takes care of uploading & executing a Hadoop job using Hadoop streaming. TODO: add code to support Elastic Mapreduce (using boto) and local execution. 
- ''' - def __init__(self, streaming_jar, modules=[], streaming_args=[], libjars=[], libjars_in_hdfs=[], jobconfs={}, input_format=None, output_format=None): + """ + + def __init__(self, streaming_jar, modules=None, streaming_args=None, libjars=None, libjars_in_hdfs=None, jobconfs=None, input_format=None, output_format=None): + def get(x, default): + return x is not None and x or default self.streaming_jar = streaming_jar - self.modules = modules - self.streaming_args = streaming_args - self.libjars = libjars - self.libjars_in_hdfs = libjars_in_hdfs - self.jobconfs = jobconfs + self.modules = get(modules, []) + self.streaming_args = get(streaming_args, []) + self.libjars = get(libjars, []) + self.libjars_in_hdfs = get(libjars_in_hdfs, []) + self.jobconfs = get(jobconfs, {}) self.input_format = input_format self.output_format = output_format self.tmp_dir = False @@ -348,10 +376,10 @@ def run_job(self, job): base_tmp_dir = configuration.get_config().get('core', 'tmp-dir', None) if base_tmp_dir: - warnings.warn("The core.tmp-dir configuration item is"\ - " deprecated, please use the TMPDIR"\ - " environment variable if you wish"\ - " to control where luigi.hadoop may"\ + warnings.warn("The core.tmp-dir configuration item is" + " deprecated, please use the TMPDIR" + " environment variable if you wish" + " to control where luigi.hadoop may" " create temporary files and directories.") self.tmp_dir = os.path.join(base_tmp_dir, 'hadoop_job_%016x' % random.getrandbits(64)) os.makedirs(self.tmp_dir) @@ -370,7 +398,7 @@ def run_job(self, job): # replace output with a temporary work directory output_final = job.output().path output_tmp_fn = output_final + '-temp-' + datetime.datetime.now().isoformat().replace(':', '-') - tmp_target = luigi.hdfs.HdfsTarget(output_tmp_fn, is_tmp=True) + tmp_target = luigi.hdfs.HdfsTarget(output_tmp_fn) arglist = luigi.hdfs.load_hadoop_cmd() + ['jar', self.streaming_jar] @@ -392,7 +420,7 @@ def run_job(self, job): dst_tmp = '%s_%09d' % (dst.replace('/', '_'), random.randint(0, 999999999)) files += ['%s#%s' % (src, dst_tmp)] # -files doesn't support subdirectories, so we need to create the dst_tmp -> dst manually - job._add_link(dst_tmp, dst) + job.add_link(dst_tmp, dst) if files: arglist += ['-files', ','.join(files)] @@ -434,11 +462,11 @@ def run_job(self, job): # submit job create_packages_archive(packages, self.tmp_dir + '/packages.tar') - job._dump(self.tmp_dir) + job.dump(self.tmp_dir) run_and_track_hadoop_job(arglist) - tmp_target.move(output_final, raise_if_exists=True) + tmp_target.move_dir(output_final) self.finish() def finish(self): @@ -452,7 +480,10 @@ def __del__(self): class DefaultHadoopJobRunner(HadoopJobRunner): - ''' The default job runner just reads from config and sets stuff ''' + """ + The default job runner just reads from config and sets stuff. + """ + def __init__(self): config = configuration.get_config() streaming_jar = config.get('hadoop', 'streaming-jar') @@ -461,29 +492,31 @@ def __init__(self): class LocalJobRunner(JobRunner): - ''' Will run the job locally + """ + Will run the job locally. This is useful for debugging and also unit testing. Tries to mimic Hadoop Streaming. 
TODO: integrate with JobTask - ''' + """ + def __init__(self, samplelines=None): self.samplelines = samplelines - def sample(self, input, n, output): - for i, line in enumerate(input): + def sample(self, input_stream, n, output): + for i, line in enumerate(input_stream): if n is not None and i >= n: break output.write(line) - def group(self, input): + def group(self, input_stream): output = StringIO.StringIO() lines = [] - for i, line in enumerate(input): + for i, line in enumerate(input_stream): parts = line.rstrip('\n').split('\t') blob = md5(str(i)).hexdigest() # pseudo-random blob to make sure the input isn't sorted lines.append((parts[:-1], blob, line)) - for k, _, line in sorted(lines): + for _, _, line in sorted(lines): output.write(line) output.seek(0) return output @@ -499,14 +532,14 @@ def run_job(self, job): if job.reducer == NotImplemented: # Map only job; no combiner, no reducer map_output = job.output().open('w') - job._run_mapper(map_input, map_output) + job.run_mapper(map_input, map_output) map_output.close() return job.init_mapper() # run job now... map_output = StringIO.StringIO() - job._run_mapper(map_input, map_output) + job.run_mapper(map_input, map_output) map_output.seek(0) if job.combiner == NotImplemented: @@ -514,13 +547,13 @@ def run_job(self, job): else: combine_input = self.group(map_output) combine_output = StringIO.StringIO() - job._run_combiner(combine_input, combine_output) + job.run_combiner(combine_input, combine_output) combine_output.seek(0) reduce_input = self.group(combine_output) job.init_reducer() reduce_output = job.output().open('w') - job._run_reducer(reduce_input, reduce_output) + job.run_reducer(reduce_input, reduce_output) reduce_output.close() @@ -538,6 +571,10 @@ class BaseHadoopJobTask(luigi.Task): _counter_dict = {} task_id = None + @abc.abstractmethod + def job_runner(self): + pass + def jobconfs(self): jcs = [] jcs.append('mapred.job.name=%s' % self.task_id) @@ -553,14 +590,14 @@ def jobconfs(self): jcs.append('mapred.job.queue.name=%s' % pool) return jcs - def init_local(self): - ''' Implement any work to setup any internal datastructure etc here. + """ + Implement any work to setup any internal datastructure etc here. You can add extra input using the requires_local/input_local methods. Anything you set on the object will be pickled and available on the Hadoop nodes. - ''' + """ pass def init_hadoop(self): @@ -571,7 +608,9 @@ def run(self): self.job_runner().run_job(self) def requires_local(self): - ''' Default impl - override this method if you need any local input to be accessible in init() ''' + """ + Default impl - override this method if you need any local input to be accessible in init(). + """ return [] def requires_hadoop(self): @@ -628,9 +667,12 @@ def _setup_remote(self): def job_runner(self): # We recommend that you define a subclass, override this method and set up your own config - """ Get the MapReduce runner for this job + """ + Get the MapReduce runner for this job. - If all outputs are HdfsTargets, the DefaultHadoopJobRunner will be used. Otherwise, the LocalJobRunner which streams all data through the local machine will be used (great for testing). + If all outputs are HdfsTargets, the DefaultHadoopJobRunner will be used. + Otherwise, the LocalJobRunner which streams all data through the local machine + will be used (great for testing). 
""" outputs = luigi.task.flatten(self.output()) for output in outputs: @@ -642,15 +684,20 @@ def job_runner(self): return DefaultHadoopJobRunner() def reader(self, input_stream): - """Reader is a method which iterates over input lines and outputs records. - The default implementation yields one argument containing the line for each line in the input.""" + """ + Reader is a method which iterates over input lines and outputs records. + + The default implementation yields one argument containing the line for each line in the input.""" for line in input_stream: yield line, def writer(self, outputs, stdout, stderr=sys.stderr): - """Writer format is a method which iterates over the output records from the reducer and formats - them for output. - The default implementation outputs tab separated items""" + """ + Writer format is a method which iterates over the output records + from the reducer and formats them for output. + + The default implementation outputs tab separated items. + """ for output in outputs: try: print >> stdout, "\t".join(map(str, flatten(output))) @@ -659,15 +706,18 @@ def writer(self, outputs, stdout, stderr=sys.stderr): raise def mapper(self, item): - """Re-define to process an input item (usually a line of input data) + """ + Re-define to process an input item (usually a line of input data). - Defaults to identity mapper that sends all lines to the same reducer""" + Defaults to identity mapper that sends all lines to the same reducer. + """ yield None, item combiner = NotImplemented def incr_counter(self, *args, **kwargs): - """ Increments a Hadoop counter + """ + Increments a Hadoop counter. Since counters can be a bit slow to update, this batches the updates. """ @@ -683,13 +733,14 @@ def incr_counter(self, *args, **kwargs): ct = self._counter_dict.get(key, 0) ct += count if ct >= threshold: - new_arg = list(key)+[ct] + new_arg = list(key) + [ct] self._incr_counter(*new_arg) ct = 0 self._counter_dict[key] = ct def _flush_batch_incr_counter(self): - """ Increments any unflushed counter values + """ + Increments any unflushed counter values. """ for key, count in self._counter_dict.iteritems(): if count == 0: @@ -698,9 +749,12 @@ def _flush_batch_incr_counter(self): self._incr_counter(*args) def _incr_counter(self, *args): - """ Increments a Hadoop counter + """ + Increments a Hadoop counter. + + Note that this seems to be a bit slow, ~1 ms - Note that this seems to be a bit slow, ~1 ms. Don't overuse this function by updating very frequently. + Don't overuse this function by updating very frequently. """ if len(args) == 2: # backwards compatibility with existing hadoop jobs @@ -714,15 +768,19 @@ def extra_modules(self): return [] # can be overridden in subclass def extra_files(self): - ''' - Can be overriden in subclass. Each element is either a string, or a pair of two strings (src, dst). - src can be a directory (in which case everything will be copied recursively). - dst can include subdirectories (foo/bar/baz.txt etc) + """ + Can be overriden in subclass. + + Each element is either a string, or a pair of two strings (src, dst). + + * `src` can be a directory (in which case everything will be copied recursively). + * `dst` can include subdirectories (foo/bar/baz.txt etc) + Uses Hadoop's -files option so that the same file is reused across tasks. 
- ''' + """ return [] - def _add_link(self, src, dst): + def add_link(self, src, dst): if not hasattr(self, '_links'): self._links = [] self._links.append((src, dst)) @@ -746,9 +804,11 @@ def _setup_links(self): 'Missing files for distributed cache: ' + ', '.join(missing)) - def _dump(self, dir=''): - """Dump instance to file.""" - file_name = os.path.join(dir, 'job-instance.pickle') + def dump(self, directory=''): + """ + Dump instance to file. + """ + file_name = os.path.join(directory, 'job-instance.pickle') if self.__module__ == '__main__': d = pickle.dumps(self) module_name = os.path.basename(sys.argv[0]).rsplit('.', 1)[0] @@ -759,11 +819,14 @@ def _dump(self, dir=''): pickle.dump(self, open(file_name, "w")) def _map_input(self, input_stream): - """Iterate over input and call the mapper for each item. - If the job has a parser defined, the return values from the parser will - be passed as arguments to the mapper. + """ + Iterate over input and call the mapper for each item. + If the job has a parser defined, the return values from the parser will + be passed as arguments to the mapper. - If the input is coded output from a previous run, the arguments will be splitted in key and value.""" + If the input is coded output from a previous run, + the arguments will be splitted in key and value. + """ for record in self.reader(input_stream): for output in self.mapper(*record): yield output @@ -773,7 +836,9 @@ def _map_input(self, input_stream): self._flush_batch_incr_counter() def _reduce_input(self, inputs, reducer, final=NotImplemented): - """Iterate over input, collect values with the same key, and call the reducer for each uniqe key.""" + """ + Iterate over input, collect values with the same key, and call the reducer for each unique key. + """ for key, values in groupby(inputs, key=lambda x: repr(x[0])): for output in reducer(eval(key), (v[1] for v in values)): yield output @@ -782,8 +847,10 @@ def _reduce_input(self, inputs, reducer, final=NotImplemented): yield output self._flush_batch_incr_counter() - def _run_mapper(self, stdin=sys.stdin, stdout=sys.stdout): - """Run the mapper on the hadoop node.""" + def run_mapper(self, stdin=sys.stdin, stdout=sys.stdout): + """ + Run the mapper on the hadoop node. + """ self.init_hadoop() self.init_mapper() outputs = self._map_input((line[:-1] for line in stdin)) @@ -792,27 +859,33 @@ def _run_mapper(self, stdin=sys.stdin, stdout=sys.stdout): else: self.internal_writer(outputs, stdout) - def _run_reducer(self, stdin=sys.stdin, stdout=sys.stdout): - """Run the reducer on the hadoop node.""" + def run_reducer(self, stdin=sys.stdin, stdout=sys.stdout): + """ + Run the reducer on the hadoop node. + """ self.init_hadoop() self.init_reducer() outputs = self._reduce_input(self.internal_reader((line[:-1] for line in stdin)), self.reducer, self.final_reducer) self.writer(outputs, stdout) - def _run_combiner(self, stdin=sys.stdin, stdout=sys.stdout): + def run_combiner(self, stdin=sys.stdin, stdout=sys.stdout): self.init_hadoop() self.init_combiner() outputs = self._reduce_input(self.internal_reader((line[:-1] for line in stdin)), self.combiner, self.final_combiner) self.internal_writer(outputs, stdout) def internal_reader(self, input_stream): - """Reader which uses python eval on each part of a tab separated string. - Yields a tuple of python objects.""" - for input in input_stream: - yield map(eval, input.split("\t")) + """ + Reader which uses python eval on each part of a tab separated string. + Yields a tuple of python objects. 
+ """ + for input_line in input_stream: + yield map(eval, input_line.split("\t")) def internal_writer(self, outputs, stdout): - """Writer which outputs the python repr for each item""" + """ + Writer which outputs the python repr for each item. + """ for output in outputs: print >> stdout, "\t".join(map(repr, output)) diff --git a/luigi/hadoop_jar.py b/luigi/hadoop_jar.py index ba1f4561be..668aad1479 100644 --- a/luigi/hadoop_jar.py +++ b/luigi/hadoop_jar.py @@ -1,3 +1,16 @@ +# Copyright (c) 2015 Spotify AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. import logging import os @@ -10,10 +23,14 @@ def fix_paths(job): - """Coerce input arguments to use temporary files when used for output. + """ + Coerce input arguments to use temporary files when used for output. + Return a list of temporary file pairs (tmpfile, destination path) and - a list of arguments. Converts each HdfsTarget to a string for the - path.""" + a list of arguments. + + Converts each HdfsTarget to a string for the path. + """ tmp_files = [] args = [] for x in job.args(): @@ -33,7 +50,9 @@ def fix_paths(job): class HadoopJarJobRunner(luigi.hadoop.JobRunner): - """JobRunner for `hadoop jar` commands. Used to run a HadoopJarJobTask""" + """ + JobRunner for `hadoop jar` commands. Used to run a HadoopJarJobTask. + """ def __init__(self): pass @@ -43,7 +62,7 @@ def run_job(self, job): # hadoop.HadoopJobRunner if not job.jar() or not os.path.exists(job.jar()): logger.error("Can't find jar: {0}, full path {1}".format(job.jar(), - os.path.abspath(job.jar()))) + os.path.abspath(job.jar()))) raise Exception("job jar does not exist") arglist = luigi.hdfs.load_hadoop_cmd() + ['jar', job.jar()] if job.main(): @@ -65,15 +84,20 @@ def run_job(self, job): class HadoopJarJobTask(luigi.hadoop.BaseHadoopJobTask): - """A job task for `hadoop jar` commands that define a jar and (optional) - main method""" + """ + A job task for `hadoop jar` commands that define a jar and (optional) main method. + """ def jar(self): - """Path to the jar for this Hadoop Job""" + """ + Path to the jar for this Hadoop Job. + """ return None def main(self): - """optional main method for this Hadoop Job""" + """ + optional main method for this Hadoop Job. + """ return None def job_runner(self): @@ -81,10 +105,14 @@ def job_runner(self): return HadoopJarJobRunner() def atomic_output(self): - """If True, then rewrite output arguments to be temp locations and - atomically move them into place after the job finishes""" + """ + If True, then rewrite output arguments to be temp locations and + atomically move them into place after the job finishes. + """ return True def args(self): - """returns an array of args to pass to the job (after hadoop jar
).""" + """ + Returns an array of args to pass to the job (after hadoop jar
). + """ return [] diff --git a/luigi/hdfs.py b/luigi/hdfs.py index 5737b7d79c..278fa783ae 100644 --- a/luigi/hdfs.py +++ b/luigi/hdfs.py @@ -12,23 +12,36 @@ # License for the specific language governing permissions and limitations under # the License. -import subprocess +import datetime +import getpass +import logging import os import random -import urlparse -import luigi.format -import luigi.contrib.target -import datetime import re +import subprocess +import urlparse import warnings -from luigi.target import FileSystem, FileSystemTarget, FileAlreadyExists -import configuration -import logging -import getpass + +import luigi.contrib.target +import luigi.format +from luigi.target import FileAlreadyExists, FileSystem, FileSystemTarget + logger = logging.getLogger('luigi-interface') +class hdfs(luigi.Config): + client_version = luigi.IntParameter(default=None) + effective_user = luigi.Parameter(default=None) + snakebite_autoconfig = luigi.BoolParameter() + namenode_host = luigi.Parameter(default=None) + namenode_port = luigi.IntParameter(default=None) + client = luigi.Parameter(default=None) + use_snakebite = luigi.BoolParameter(default=None) + tmp_dir = luigi.Parameter(config_path=dict(section='core', name='hdfs-tmp-dir'), default=None) + + class HDFSCliError(Exception): + def __init__(self, command, returncode, stdout, stderr): self.returncode = returncode self.stdout = stdout @@ -66,31 +79,31 @@ def tmppath(path=None, include_unix_username=True): addon = "luigitemp-%08d" % random.randrange(1e9) temp_dir = '/tmp' # default tmp dir if none is specified in config - #1. Figure out to which temporary directory to place - configured_hdfs_tmp_dir = configuration.get_config().get('core', 'hdfs-tmp-dir', None) + # 1. Figure out to which temporary directory to place + configured_hdfs_tmp_dir = hdfs().tmp_dir if configured_hdfs_tmp_dir is not None: - #config is superior + # config is superior base_dir = configured_hdfs_tmp_dir elif path is not None: - #need to copy correct schema and network location + # need to copy correct schema and network location parsed = urlparse.urlparse(path) base_dir = urlparse.urlunparse((parsed.scheme, parsed.netloc, temp_dir, '', '', '')) else: - #just system temporary directory + # just system temporary directory base_dir = temp_dir - #2. Figure out what to place + # 2. Figure out what to place if path is not None: if path.startswith(temp_dir + '/'): - #Not 100%, but some protection from directories like /tmp/tmp/file + # Not 100%, but some protection from directories like /tmp/tmp/file subdir = path[len(temp_dir):] else: - #Protection from /tmp/hdfs:/dir/file + # Protection from /tmp/hdfs:/dir/file parsed = urlparse.urlparse(path) subdir = parsed.path subdir = subdir.lstrip('/') + '-' else: - #just return any random temporary location + # just return any random temporary location subdir = '' if include_unix_username: @@ -98,6 +111,7 @@ def tmppath(path=None, include_unix_username=True): return os.path.join(base_dir, subdir + addon) + def list_path(path): if isinstance(path, list) or isinstance(path, tuple): return path @@ -105,13 +119,47 @@ def list_path(path): return [path, ] return [str(path), ] + +def is_dangerous_rm_path(path): + """ Determines if it is risky to remove such a path. + + Examples: + * blanks + * top level root, e.g. / + * absolute path that is one level deep, e.g. /etc or /opt + * tilde, e.g. ~ + + :return bool: True if too dangerous + + >>> for danger in ['~', '~/', ' ', '/', '/opt', '/etc/', '/etc//', + ... 
'//', ' /opt ', ' /opt// ', '//opt']: + ... assert is_dangerous_rm_path(danger), 'expected dangerous: %r' % danger + >>> for safe in ['~/foo', '/foo/bar', 'foo', ' foo ', 'bar/', 'silly//']: + ... assert not is_dangerous_rm_path(safe), 'expected safe: %r' % safe + >>> try: + ... is_dangerous_rm_path(None) + ... except AttributeError: + ... pass + """ + path = path.strip().rstrip('/') + + if path.startswith('/'): + path = path.lstrip('/') + return len(path.split('/')) <= 1 + else: + return path in ('', '~') + + class HdfsClient(FileSystem): - """This client uses Apache 2.x syntax for file system commands, which also matched CDH4""" + """ + This client uses Apache 2.x syntax for file system commands, which also matched CDH4. + """ recursive_listdir_cmd = ['-ls', '-R'] def exists(self, path): - """ Use ``hadoop fs -stat`` to check file existence + """ + Use ``hadoop fs -stat`` to check file existence. """ cmd = load_hadoop_cmd() + ['fs', '-stat', path] @@ -138,9 +186,35 @@ def rename(self, path, dest): warnings.warn("Renaming multiple files at once is not atomic.") call_check(load_hadoop_cmd() + ['fs', '-mv'] + path + [dest]) - def remove(self, path, recursive=True, skip_trash=False): + def rename_dont_move(self, path, dest): + """ + Override this method with an implementation that uses rename2, + which is a rename operation that never moves. + + For instance, `rename2 a b` never moves `a` into `b` folder. + + Currently, the hadoop cli does not support this operation. + + We keep the interface simple by just aliasing this to + normal rename and let individual implementations redefine the method. + + rename2 - + https://github.com/apache/hadoop/blob/ae91b13/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/protocol/ClientProtocol.java + (lines 483-523) + """ + warnings.warn("Configured HDFS client doesn't support rename_dont_move, using normal mv operation instead.") + if self.exists(dest): + return False + self.rename(path, dest) + return True + + def remove(self, path, recursive=True, skip_trash=False, chicken=True): if recursive: cmd = load_hadoop_cmd() + ['fs', '-rm', '-r'] + + if chicken and is_dangerous_rm_path(path): + raise ValueError("Too chicken to recursively " + "delete '%s'" % path) else: cmd = load_hadoop_cmd() + ['fs', '-rm'] @@ -177,7 +251,7 @@ def count(self, path): if line.startswith("OpenJDK 64-Bit Server VM warning") or line.startswith("It's highly recommended") or not line: lines.pop(lines.index(line)) else: - (dir_count, file_count, content_size, ppath) = stdout.split() + (dir_count, file_count, content_size, ppath) = stdout.split() results = {'content_size': content_size, 'dir_count': dir_count, 'file_count': file_count} return results @@ -205,7 +279,7 @@ def mkdir(self, path, parents=True, raise_if_exists=False): (['-p'] if parents else []) + [path]) call_check(cmd) - except HDFSCliError, ex: + except HDFSCliError as ex: if "File exists" in ex.stderr: if raise_if_exists: raise FileAlreadyExists(ex.stderr) @@ -254,16 +328,18 @@ def listdir(self, path, ignore_directories=False, ignore_files=False, else: yield file + class SnakebiteHdfsClient(HdfsClient): """ This client uses Spotify's snakebite client whenever possible. 
+ @author: Alan Brenner github.com/alanbbr """ + def __init__(self): super(SnakebiteHdfsClient, self).__init__() try: from snakebite.client import Client - self.config = configuration.get_config() self._bite = None self.pid = -1 except Exception as err: # IGNORE:broad-except @@ -284,28 +360,23 @@ def get_bite(self): """ If Luigi has forked, we have a different PID, and need to reconnect. """ + config = hdfs() if self.pid != os.getpid() or not self._bite: - autoconfig_enabled = self.config.getboolean("hdfs", "snakebite_autoconfig", False) - if autoconfig_enabled is True: + client_kwargs = dict(filter(lambda k_v: k_v[1] is not None and k_v[1] != '', { + 'hadoop_version': config.client_version, + 'effective_user': config.effective_user, + }.iteritems())) + if config.snakebite_autoconfig: """ This is fully backwards compatible with the vanilla Client and can be used for a non HA cluster as well. This client tries to read ``${HADOOP_PATH}/conf/hdfs-site.xml`` to get the address of the namenode. The behaviour is the same as Client. """ from snakebite.client import AutoConfigClient - self._bite = AutoConfigClient() + self._bite = AutoConfigClient(**client_kwargs) else: from snakebite.client import Client - try: - ver = self.config.getint("hdfs", "client_version") - if ver is None: - raise RuntimeError() - self._bite = Client(self.config.get("hdfs", "namenode_host"), - self.config.getint("hdfs", "namenode_port"), - hadoop_version=ver) - except: - self._bite = Client(self.config.get("hdfs", "namenode_host"), - self.config.getint("hdfs", "namenode_port")) + self._bite = Client(config.namenode_host, config.namenode_port, **client_kwargs) return self._bite def exists(self, path): @@ -338,7 +409,25 @@ def rename(self, path, dest): self.mkdir(dir_path, parents=True) return list(self.get_bite().rename(list_path(path), dest)) - def remove(self, path, recursive=True, skip_trash=False): + def rename_dont_move(self, path, dest): + """ + Use snakebite.rename_dont_move, if available. + + :param path: source path (single input) + :type path: string + :param dest: destination path + :type dest: string + :return: True if succeeded + :raises: snakebite.errors.FileAlreadyExistsException + """ + from snakebite.errors import FileAlreadyExistsException + try: + self.get_bite().rename2(path, dest, overwriteDest=False) + return True + except FileAlreadyExistsException: + return False + + def remove(self, path, recursive=True, skip_trash=False, chicken=None): """ Use snakebite.delete, if available. @@ -348,6 +437,8 @@ def remove(self, path, recursive=True, skip_trash=False): :type recursive: boolean, default is True :param skip_trash: do or don't move deleted items into the trash first :type skip_trash: boolean, default is False (use trash) + :param chicken: ignored + :type chicken: ignored :return: list of deleted items """ return list(self.get_bite().delete(list_path(path), recurse=recursive)) @@ -365,7 +456,7 @@ def chmod(self, path, permissions, recursive=False): :return: list of all changed items """ return list(self.get_bite().chmod(list_path(path), - permissions, recursive)) + permissions, recursive)) def chown(self, path, owner, group, recursive=False): """ @@ -419,7 +510,7 @@ def get(self, path, local_destination): return list(self.get_bite().copyToLocal(list_path(path), local_destination)) - def mkdir(self, path, parents=True, mode=0755, raise_if_exists=False): + def mkdir(self, path, parents=True, mode=0o755, raise_if_exists=False): """ Use snakebite.mkdir, if available. 
@@ -480,23 +571,34 @@ def listdir(self, path, ignore_directories=False, ignore_files=False, else: yield rval[0] + class HdfsClientCdh3(HdfsClient): - """This client uses CDH3 syntax for file system commands""" - def mkdir(self, path): - ''' - No -p switch, so this will fail creating ancestors - ''' + """ + This client uses CDH3 syntax for file system commands. + """ + + def mkdir(self, path, parents=False, raise_if_exists=False): + """ + No -p switch, so this will fail creating ancestors. + + :param parents: ignored + """ try: call_check(load_hadoop_cmd() + ['fs', '-mkdir', path]) - except HDFSCliError, ex: + except HDFSCliError as ex: if "File exists" in ex.stderr: - raise FileAlreadyExists(ex.stderr) + if raise_if_exists: + raise FileAlreadyExists(ex.stderr) else: raise - def remove(self, path, recursive=True, skip_trash=False): + def remove(self, path, recursive=True, skip_trash=False, chicken=True): if recursive: cmd = load_hadoop_cmd() + ['fs', '-rmr'] + + if chicken and is_dangerous_rm_path(path): + raise ValueError("Too chicken to recursively " + "delete '%s'" % path) else: cmd = load_hadoop_cmd() + ['fs', '-rm'] @@ -506,9 +608,12 @@ def remove(self, path, recursive=True, skip_trash=False): cmd = cmd + [path] call_check(cmd) + class HdfsClientApache1(HdfsClientCdh3): - """This client uses Apache 1.x syntax for file system commands, - which are similar to CDH3 except for the file existence check""" + """ + This client uses Apache 1.x syntax for file system commands, + which are similar to CDH3 except for the file existence check. + """ recursive_listdir_cmd = ['-lsr'] @@ -527,23 +632,27 @@ def exists(self, path): def get_configured_hadoop_version(): """ CDH4 (hadoop 2+) has a slightly different syntax for interacting with hdfs - via the command line. The default version is CDH4, but one can override + via the command line. + + The default version is CDH4, but one can override this setting with "cdh3" or "apache1" in the hadoop section of the config - in order to use the old syntax + in order to use the old syntax. """ - return configuration.get_config().get("hadoop", "version", "cdh4").lower() + return luigi.configuration.get_config().get("hadoop", "version", "cdh4").lower() def get_configured_hdfs_client(show_warnings=True): - """ This is a helper that fetches the configuration value for 'client' in + """ + This is a helper that fetches the configuration value for 'client' in the [hdfs] section. It will return the client that retains backwards - compatibility when 'client' isn't configured. """ - config = configuration.get_config() - custom = config.get("hdfs", "client", None) + compatibility when 'client' isn't configured. + """ + config = hdfs() + custom = config.client if custom: # Eventually this should be the only valid code path return custom - if config.getboolean("hdfs", "use_snakebite", False): + if config.use_snakebite: if show_warnings: warnings.warn("Deprecated: Just specify 'client: snakebite' in config") return "snakebite" @@ -553,8 +662,10 @@ def get_configured_hdfs_client(show_warnings=True): def create_hadoopcli_client(): - """ Given that we want one of the hadoop cli clients (unlike snakebite), - this one will return the right one """ + """ + Given that we want one of the hadoop cli clients (unlike snakebite), + this one will return the right one. 
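As a rough sketch of how the helpers above resolve the configured client at runtime (which class comes back depends entirely on the ``[hadoop]`` and ``[hdfs]`` sections of ``client.cfg``):

.. code-block:: python

    import luigi.hdfs

    # "cdh4" (the default), "cdh3" or "apache1" selects the CLI syntax
    print luigi.hdfs.get_configured_hadoop_version()

    # while the configured hdfs client decides between the CLI wrappers and snakebite
    client = luigi.hdfs.get_autoconfig_client()
    print type(client).__name__   # e.g. HdfsClient or SnakebiteHdfsClient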
+ """ version = get_configured_hadoop_version() if version == "cdh4": return HdfsClient() @@ -566,8 +677,11 @@ def create_hadoopcli_client(): raise Exception("Error: Unknown version specified in Hadoop version" "configuration parameter") + def get_autoconfig_client(show_warnings=True): - """Creates the client as specified in the `client.cfg` configuration""" + """ + Creates the client as specified in the `client.cfg` configuration. + """ configured_client = get_configured_hdfs_client(show_warnings=show_warnings) if configured_client == "snakebite": return SnakebiteHdfsClient() @@ -588,12 +702,14 @@ def get_autoconfig_client(show_warnings=True): class HdfsReadPipe(luigi.format.InputPipeProcessWrapper): + def __init__(self, path): super(HdfsReadPipe, self).__init__(load_hadoop_cmd() + ['fs', '-cat', path]) class HdfsAtomicWritePipe(luigi.format.OutputPipeProcessWrapper): - """ File like object for writing to HDFS + """ + File like object for writing to HDFS The referenced file is first written to a temporary location and then renamed to final location on close(). If close() isn't called @@ -613,7 +729,7 @@ def __init__(self, path): def abort(self): logger.info("Aborting %s('%s'). Removing temporary file '%s'", - self.__class__.__name__, self.path, self.tmppath) + self.__class__.__name__, self.path, self.tmppath) super(HdfsAtomicWritePipe, self).abort() remove(self.tmppath) @@ -623,7 +739,10 @@ def close(self): class HdfsAtomicWriteDirPipe(luigi.format.OutputPipeProcessWrapper): - """ Writes a data file to a directory at """ + """ + Writes a data file to a directory at . + """ + def __init__(self, path, data_extension=""): self.path = path self.tmppath = tmppath(self.path) @@ -632,7 +751,7 @@ def __init__(self, path, data_extension=""): def abort(self): logger.info("Aborting %s('%s'). Removing temporary dir '%s'", - self.__class__.__name__, self.path, self.tmppath) + self.__class__.__name__, self.path, self.tmppath) super(HdfsAtomicWriteDirPipe, self).abort() remove(self.tmppath) @@ -642,6 +761,7 @@ def close(self): class Plain(luigi.format.Format): + @classmethod def hdfs_reader(cls, path): return HdfsReadPipe(path) @@ -652,6 +772,7 @@ def pipe_writer(cls, output_pipe): class PlainDir(luigi.format.Format): + @classmethod def hdfs_reader(cls, path): # exclude underscore-prefixedfiles/folders (created by MapReduce) @@ -677,7 +798,7 @@ def __init__(self, path=None, format=Plain, is_tmp=False, fs=None): self._fs = fs or get_autoconfig_client() def __del__(self): - #TODO: not sure is_tmp belongs in Targets construction arguments + # TODO: not sure is_tmp belongs in Targets construction arguments if self.is_tmp and self.exists(): self.remove() @@ -706,12 +827,13 @@ def open(self, mode='r'): except NotImplementedError: return self.format.pipe_writer(HdfsAtomicWritePipe(self.path)) - def remove(self, skip_trash=False): - remove(self.path, skip_trash=skip_trash) + def remove(self, skip_trash=False, chicken=True): + remove(self.path, skip_trash=skip_trash, chicken=chicken) @luigi.util.deprecate_kwarg('fail_if_exists', 'raise_if_exists', False) def rename(self, path, fail_if_exists=False): - """ Rename does not change self.path, so be careful with assumptions + """ + Rename does not change self.path, so be careful with assumptions. Not recommendeed for directories. Use move_dir. 
spotify/luigi#522 """ @@ -723,19 +845,29 @@ def rename(self, path, fail_if_exists=False): @luigi.util.deprecate_kwarg('fail_if_exists', 'raise_if_exists', False) def move(self, path, fail_if_exists=False): - """ Move does not change self.path, so be careful with assumptions + """ + Move does not change self.path, so be careful with assumptions. Not recommendeed for directories. Use move_dir. spotify/luigi#522 """ self.rename(path, raise_if_exists=fail_if_exists) def move_dir(self, path): - # mkdir will fail if directory already exists, thereby ensuring atomicity - if isinstance(path, HdfsTarget): - path = path.path - mkdir(path, parents=False, raise_if_exists=True) - rename(self.path + '/*', path) - self.remove() + """ + Rename a directory. + + The implementation uses `rename_dont_move`, + which on some clients is just a normal `mv` operation, which can cause + nested directories. + + One could argue that the implementation should use the + mkdir+raise_if_exists approach, but we at Spotify have had more trouble + with that over just using plain mv. See spotify/luigi#557 + """ + move_succeeded = self.fs.rename_dont_move(self.path, path) + if move_succeeded: + self.path = path + return move_succeeded def is_writable(self): if "/" in self.path: diff --git a/luigi/hive.py b/luigi/hive.py index 2cb0d1e111..a0f683ce9f 100644 --- a/luigi/hive.py +++ b/luigi/hive.py @@ -13,18 +13,20 @@ import abc import logging import operator -import luigi -import luigi.hadoop -from luigi.target import FileSystemTarget, FileAlreadyExists import os import subprocess import tempfile + +import luigi +import luigi.hadoop +from luigi.target import FileAlreadyExists, FileSystemTarget from luigi.task import flatten logger = logging.getLogger('luigi-interface') class HiveCommandError(RuntimeError): + def __init__(self, message, out=None, err=None): super(HiveCommandError, self).__init__(message, out, err) self.message = message @@ -41,12 +43,13 @@ def get_hive_syntax(): def run_hive(args, check_return_code=True): - """Runs the `hive` from the command line, passing in the given args, and - returning stdout. + """ + Runs the `hive` from the command line, passing in the given args, and + returning stdout. - With the apache release of Hive, so of the table existence checks - (which are done using DESCRIBE do not exit with a return code of 0 - so we need an option to ignore the return code and just return stdout for parsing + With the apache release of Hive, so of the table existence checks + (which are done using DESCRIBE do not exit with a return code of 0 + so we need an option to ignore the return code and just return stdout for parsing """ cmd = [load_hive_cmd()] + args p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) @@ -58,12 +61,16 @@ def run_hive(args, check_return_code=True): def run_hive_cmd(hivecmd, check_return_code=True): - """Runs the given hive query and returns stdout""" + """ + Runs the given hive query and returns stdout. + """ return run_hive(['-e', hivecmd], check_return_code) def run_hive_script(script): - """Runs the contents of the given script in hive and returns stdout""" + """ + Runs the contents of the given script in hive and returns stdout. 
+ """ if not os.path.isfile(script): raise RuntimeError("Hive script: {0} does not exist.".format(script)) return run_hive(['-f', script]) @@ -73,7 +80,7 @@ class HiveClient(object): # interface __metaclass__ = abc.ABCMeta @abc.abstractmethod - def table_location(self, table, database='default', partition={}): + def table_location(self, table, database='default', partition=None): """ Returns location of db.table (or db.table.partition). partition is a dict of partition key to value. @@ -82,13 +89,15 @@ def table_location(self, table, database='default', partition={}): @abc.abstractmethod def table_schema(self, table, database='default'): - """ Returns list of [(name, type)] for each column in database.table """ + """ + Returns list of [(name, type)] for each column in database.table. + """ pass @abc.abstractmethod - def table_exists(self, table, database='default', partition={}): + def table_exists(self, table, database='default', partition=None): """ - Returns true iff db.table (or db.table.partition) exists. partition is a dict of partition key to + Returns true if db.table (or db.table.partition) exists. partition is a dict of partition key to value. """ pass @@ -100,10 +109,13 @@ def partition_spec(self, partition): class HiveCommandClient(HiveClient): - """ Uses `hive` invocations to find information """ - def table_location(self, table, database='default', partition={}): + """ + Uses `hive` invocations to find information. + """ + + def table_location(self, table, database='default', partition=None): cmd = "use {0}; describe formatted {1}".format(database, table) - if partition: + if partition is not None: cmd += " PARTITION ({0})".format(self.partition_spec(partition)) stdout = run_hive_cmd(cmd) @@ -112,8 +124,8 @@ def table_location(self, table, database='default', partition={}): if "Location:" in line: return line.split("\t")[1] - def table_exists(self, table, database='default', partition={}): - if not partition: + def table_exists(self, table, database='default', partition=None): + if partition is None: stdout = run_hive_cmd('use {0}; show tables like "{1}";'.format(database, table)) return stdout and table in stdout @@ -133,9 +145,11 @@ def table_schema(self, table, database='default'): return [tuple([x.strip() for x in line.strip().split("\t")]) for line in describe.strip().split("\n")] def partition_spec(self, partition): - """ Turns a dict into the a Hive partition specification string """ + """ + Turns a dict into the a Hive partition specification string. + """ return ','.join(["{0}='{1}'".format(k, v) for (k, v) in - sorted(partition.items(), key=operator.itemgetter(0))]) + sorted(partition.iteritems(), key=operator.itemgetter(0))]) class ApacheHiveCommandClient(HiveCommandClient): @@ -143,6 +157,7 @@ class ApacheHiveCommandClient(HiveCommandClient): A subclass for the HiveCommandClient to (in some cases) ignore the return code from the hive command so that we can just parse the output. 
""" + def table_schema(self, table, database='default'): describe = run_hive_cmd("use {0}; describe {1}".format(database, table), False) if not describe or "Table not found" in describe: @@ -151,18 +166,19 @@ def table_schema(self, table, database='default'): class MetastoreClient(HiveClient): - def table_location(self, table, database='default', partition={}): + + def table_location(self, table, database='default', partition=None): with HiveThriftContext() as client: - if partition: + if partition is not None: partition_str = self.partition_spec(partition) thrift_table = client.get_partition_by_name(database, table, partition_str) else: thrift_table = client.get_table(database, table) return thrift_table.sd.location - def table_exists(self, table, database='default', partition={}): + def table_exists(self, table, database='default', partition=None): with HiveThriftContext() as client: - if not partition: + if partition is None: return table in client.get_all_tables(database) else: return partition in self._existing_partitions(table, database, client) @@ -184,11 +200,14 @@ def table_schema(self, table, database='default'): return [(field_schema.name, field_schema.type) for field_schema in client.get_schema(database, table)] def partition_spec(self, partition): - return "/".join("%s=%s" % (k, v) for (k, v) in sorted(partition.items(), key=operator.itemgetter(0))) + return "/".join("%s=%s" % (k, v) for (k, v) in sorted(partition.iteritems(), key=operator.itemgetter(0))) class HiveThriftContext(object): - """ Context manager for hive metastore client """ + """ + Context manager for hive metastore client. + """ + def __enter__(self): try: from thrift import Thrift @@ -208,7 +227,7 @@ def __enter__(self): transport.open() self.transport = transport return ThriftHiveMetastore.Client(protocol) - except ImportError, e: + except ImportError as e: raise Exception('Could not import Hive thrift library:' + str(e)) def __exit__(self, exc_type, exc_val, exc_tb): @@ -222,7 +241,10 @@ def __exit__(self, exc_type, exc_val, exc_tb): class HiveQueryTask(luigi.hadoop.BaseHadoopJobTask): - """ Task to run a hive query """ + """ + Task to run a hive query. + """ + # by default, we let hive figure these out. n_reduce_tasks = None bytes_per_reducer = None @@ -234,10 +256,12 @@ def query(self): raise RuntimeError("Must implement query!") def hiverc(self): - """ Location of an rc file to run before the query - if hiverc-location key is specified in client.cfg, will default to the value there - otherwise returns None - Returning a list of rc files will load all of them in order. + """ + Location of an rc file to run before the query + if hiverc-location key is specified in client.cfg, will default to the value there + otherwise returns None. + + Returning a list of rc files will load all of them in order. """ return luigi.configuration.get_config().get('hive', 'hiverc-location', default=None) @@ -246,6 +270,7 @@ def hiveconfs(self): Returns an dict of key=value settings to be passed along to the hive command line via --hiveconf. By default, sets mapred.job.name to task_id and if not None, sets: + * mapred.reduce.tasks (n_reduce_tasks) * mapred.fairscheduler.pool (pool) or mapred.job.queue.name (pool) * hive.exec.reducers.bytes.per.reducer (bytes_per_reducer) @@ -273,10 +298,13 @@ def job_runner(self): class HiveQueryRunner(luigi.hadoop.JobRunner): - """ Runs a HiveQueryTask by shelling out to hive """ + """ + Runs a HiveQueryTask by shelling out to hive. 
+ """ def prepare_outputs(self, job): - """ Called before job is started + """ + Called before job is started. If output is a `FileSystemTarget`, create parent directories so the hive command won't fail """ @@ -301,7 +329,7 @@ def run_job(self, job): arglist = [load_hive_cmd(), '-f', f.name] hiverc = job.hiverc() if hiverc: - if type(hiverc) == str: + if isinstance(hiverc, str): hiverc = [hiverc] for rcfile in hiverc: arglist += ['-i', rcfile] @@ -314,7 +342,9 @@ def run_job(self, job): class HiveTableTarget(luigi.Target): - """ exists returns true if the table exists """ + """ + exists returns true if the table exists. + """ def __init__(self, table, database='default', client=default_client): self.database = database @@ -328,7 +358,9 @@ def exists(self): @property def path(self): - """Returns the path to this table in HDFS""" + """ + Returns the path to this table in HDFS. + """ location = self.client.table_location(self.table, self.database) if not location: raise Exception("Couldn't find location for table: {0}".format(str(self))) @@ -339,7 +371,9 @@ def open(self, mode): class HivePartitionTarget(luigi.Target): - """ exists returns true if the table's partition exists """ + """ + exists returns true if the table's partition exists. + """ def __init__(self, table, partition, database='default', fail_missing_table=True, client=default_client): self.database = database @@ -353,7 +387,7 @@ def exists(self): try: logger.debug("Checking Hive table '{d}.{t}' for partition {p}".format(d=self.database, t=self.table, p=str(self.partition))) return self.client.table_exists(self.table, self.database, self.partition) - except HiveCommandError, e: + except HiveCommandError as e: if self.fail_missing_table: raise else: @@ -366,7 +400,9 @@ def exists(self): @property def path(self): - """Returns the path for this HiveTablePartitionTarget's data""" + """ + Returns the path for this HiveTablePartitionTarget's data. + """ location = self.client.table_location(self.table, self.database, self.partition) if not location: raise Exception("Couldn't find location for table: {0}".format(str(self))) @@ -377,7 +413,9 @@ def open(self, mode): class ExternalHiveTask(luigi.ExternalTask): - """ External task that depends on a Hive table/partition """ + """ + External task that depends on a Hive table/partition. + """ database = luigi.Parameter(default='default') table = luigi.Parameter() diff --git a/luigi/interface.py b/luigi/interface.py index 30874479c8..379d9f30a2 100644 --- a/luigi/interface.py +++ b/luigi/interface.py @@ -12,21 +12,21 @@ # License for the specific language governing permissions and limitations under # the License. 
-import worker -import lock +import argparse import logging import logging.config -import rpc import optparse -import scheduler -import warnings +import os +import sys +import tempfile + import configuration -import task +import lock import parameter -import re -import argparse -import sys -import os +import rpc +import scheduler +import task +import worker from task import Register @@ -39,106 +39,68 @@ def setup_interface_logging(conf_file=None): logger = logging.getLogger('luigi-interface') logger.setLevel(logging.DEBUG) - streamHandler = logging.StreamHandler() - streamHandler.setLevel(logging.DEBUG) + stream_handler = logging.StreamHandler() + stream_handler.setLevel(logging.DEBUG) formatter = logging.Formatter('%(levelname)s: %(message)s') - streamHandler.setFormatter(formatter) + stream_handler.setFormatter(formatter) - logger.addHandler(streamHandler) + logger.addHandler(stream_handler) else: logging.config.fileConfig(conf_file, disable_existing_loggers=False) setup_interface_logging.has_run = True -def load_task(parent_task, task_name, params): - """ Imports task and uses ArgParseInterface to initialize it - """ - # How the module is represented depends on if Luigi was started from - # that file or if the module was imported later on - module = sys.modules[parent_task.__module__] - if module.__name__ == '__main__': - parent_module_path = os.path.abspath(module.__file__) - for p in sys.path: - if parent_module_path.startswith(p): - end = parent_module_path.rfind('.py') - actual_module = parent_module_path[len(p):end].strip( - '/').replace('/', '.') - break - else: - actual_module = module.__name__ - return init_task(actual_module, task_name, params, {}) - - -def init_task(module_name, task, str_params, global_str_params): - __import__(module_name) - module = sys.modules[module_name] - Task = getattr(module, task) - - return Task.from_str_params(str_params, global_str_params) - +class EnvironmentParamsContainer(task.ConfigWithoutSection): -class EnvironmentParamsContainer(task.Task): ''' Keeps track of a bunch of environment params. Uses the internal luigi parameter mechanism. The nice thing is that we can instantiate this class and get an object with all the environment variables set. - This is arguably a bit of a hack.''' + This is arguably a bit of a hack. 
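These parameters are what the familiar command line flags and ``luigi.build`` keyword arguments end up populating; a small self-contained sketch (the task is a stand-in that does nothing):

.. code-block:: python

    import luigi


    class Hello(luigi.Task):
        """Stand-in task used only to show how the environment params are passed."""

        def complete(self):
            return True


    if __name__ == '__main__':
        # roughly equivalent to: python this_file.py Hello --local-scheduler --workers 2
        luigi.build([Hello()], local_scheduler=True, workers=2)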
+ ''' - local_scheduler = parameter.BooleanParameter( - is_global=True, default=False, + local_scheduler = parameter.BoolParameter( + default=False, description='Use local scheduling') scheduler_host = parameter.Parameter( - is_global=True, default='localhost', description='Hostname of machine running remote scheduler', config_path=dict(section='core', name='default-scheduler-host')) scheduler_port = parameter.IntParameter( - is_global=True, default=8082, + default=8082, description='Port of remote scheduler api process', config_path=dict(section='core', name='default-scheduler-port')) - lock = parameter.BooleanParameter( - is_global=True, default=False, - description='(Deprecated, replaced by no_lock)' - 'Do not run if similar process is already running') lock_size = parameter.IntParameter( - is_global=True, default=1, + default=1, description="Maximum number of workers running the same command") - no_lock = parameter.BooleanParameter( - is_global=True, default=False, + no_lock = parameter.BoolParameter( + default=False, description='Ignore if similar process is already running') lock_pid_dir = parameter.Parameter( - is_global=True, default='/var/tmp/luigi', + default=os.path.join(tempfile.gettempdir(), 'luigi'), description='Directory to store the pid file') workers = parameter.IntParameter( - is_global=True, default=1, + default=1, description='Maximum number of parallel tasks to run') logging_conf_file = parameter.Parameter( - is_global=True, default=None, + default=None, description='Configuration file for logging', config_path=dict(section='core', name='logging_conf_file')) module = parameter.Parameter( - is_global=True, default=None, - description='Used for dynamic loading of modules') # see DynamicArgParseInterface - parallel_scheduling = parameter.BooleanParameter( - is_global=True, default=False, + default=None, + description='Used for dynamic loading of modules') # see DynamicArgParseInterface + parallel_scheduling = parameter.BoolParameter( + default=False, description='Use multiprocessing to do scheduling in parallel.', config_path={'section': 'core', 'name': 'parallel-scheduling'}, ) - @classmethod - def env_params(cls, override_defaults={}): - # Override any global parameter with whatever is in override_defaults - for param_name, param_obj in cls.get_global_params(): - if param_name in override_defaults: - param_obj.set_global(override_defaults[param_name]) - - return cls() # instantiate an object with the global params set on it - class WorkerSchedulerFactory(object): + def create_local_scheduler(self): return scheduler.CentralPlannerScheduler() @@ -151,19 +113,23 @@ def create_worker(self, scheduler, worker_processes): class Interface(object): + def parse(self): raise NotImplementedError @staticmethod - def run(tasks, worker_scheduler_factory=None, override_defaults={}): + def run(tasks, worker_scheduler_factory=None, override_defaults=None): """ - :return: True if all tasks and their dependencies were successfully run (or already completed) - False if any error occurred + :param tasks: + :param worker_scheduler_factory: + :param override_defaults: + :return: True if all tasks and their dependencies were successfully run (or already completed); + False if any error occurred. 
""" if worker_scheduler_factory is None: worker_scheduler_factory = WorkerSchedulerFactory() - env_params = EnvironmentParamsContainer.env_params(override_defaults) + env_params = EnvironmentParamsContainer(**override_defaults) # search for logging configuration path first on the command line, then # in the application config file logging_conf = env_params.logging_conf_file @@ -176,15 +142,6 @@ def run(tasks, worker_scheduler_factory=None, override_defaults={}): 'core', 'no_configure_logging', False): setup_interface_logging(logging_conf) - if env_params.lock: - warnings.warn( - "The --lock flag is deprecated and will be removed." - "Locking is now the default behavior." - "Use --no-lock to override to not use lock", - DeprecationWarning, - stacklevel=3 - ) - if (not env_params.no_lock and not(lock.acquire_for(env_params.lock_pid_dir, env_params.lock_size))): sys.exit(1) @@ -209,129 +166,138 @@ def run(tasks, worker_scheduler_factory=None, override_defaults={}): return success -class ErrorWrappedArgumentParser(argparse.ArgumentParser): - ''' Wraps ArgumentParser's error message to suggested similar tasks - ''' +# Simple unweighted Levenshtein distance +def _editdistance(a, b): + r0 = range(0, len(b) + 1) + r1 = [0] * (len(b) + 1) - # Simple unweighted Levenshtein distance - def _editdistance(self, a, b): - r0 = range(0, len(b) + 1) - r1 = [0] * (len(b) + 1) + for i in range(0, len(a)): + r1[0] = i + 1 - for i in range(0, len(a)): - r1[0] = i + 1 + for j in range(0, len(b)): + c = 0 if a[i] is b[j] else 1 + r1[j + 1] = min(r1[j] + 1, r0[j + 1] + 1, r0[j] + c) - for j in range(0, len(b)): - c = 0 if a[i] is b[j] else 1 - r1[j + 1] = min(r1[j] + 1, r0[j + 1] + 1, r0[j] + c) + r0 = r1[:] - r0 = r1[:] + return r1[len(b)] - return r1[len(b)] - def error(self, message): - result = re.match("argument .+: invalid choice: '(\w+)'.+", message) - if result: - arg = result.group(1) - weightedTasks = [(self._editdistance(arg, task), task) for task in Register.get_reg().keys()] - orderedTasks = sorted(weightedTasks, key=lambda pair: pair[0]) - candidates = [task for (dist, task) in orderedTasks if dist <= 5 and dist < len(task)] - displaystring = "" - if candidates: - displaystring = "No task %s. Did you mean:\n%s" % (arg, '\n'.join(candidates)) - else: - displaystring = "No task %s." % arg - super(ErrorWrappedArgumentParser, self).error(displaystring) - else: - super(ErrorWrappedArgumentParser, self).error(message) +def error_task_names(task_name, task_names): + weighted_tasks = [(_editdistance(task_name, task_name_2), task_name_2) for task_name_2 in task_names] + ordered_tasks = sorted(weighted_tasks, key=lambda pair: pair[0]) + candidates = [task for (dist, task) in ordered_tasks if dist <= 5 and dist < len(task)] + display_string = "" + if candidates: + display_string = "No task %s. Did you mean:\n%s" % (task_name, '\n'.join(candidates)) + else: + display_string = "No task %s." 
% task_name + raise SystemExit(display_string) -class ArgParseInterface(Interface): - ''' Takes the task as the command, with parameters specific to it - ''' - @classmethod - def add_parameter(cls, parser, param_name, param, prefix=None): - description = [] - if prefix: - description.append('%s.%s' % (prefix, param_name)) - else: - description.append(param_name) - if param.description: - description.append(param.description) - if param.has_value: - description.append(" [default: %s]" % (param.value,)) - - if param.is_list: - action = "append" - elif param.is_boolean: - action = "store_true" - else: - action = "store" - parser.add_argument('--' + param_name.replace('_', '-'), help=' '.join(description), default=None, action=action) - @classmethod - def add_task_parameters(cls, parser, task_cls): - for param_name, param in task_cls.get_nonglobal_params(): - cls.add_parameter(parser, param_name, param, task_cls.task_family) +def add_task_parameters(parser, task_cls, optparse=False): + for param_name, param in task_cls.get_params(): + param.add_to_cmdline_parser(parser, param_name, task_cls.task_family, optparse=optparse, glob=False) - @classmethod - def add_global_parameters(cls, parser): - for param_name, param in Register.get_global_params(): - cls.add_parameter(parser, param_name, param) - def parse_task(self, cmdline_args=None, main_task_cls=None): - parser = ErrorWrappedArgumentParser() +def add_global_parameters(parser, optparse=False): + seen_params = set() + for task_name, is_without_section, param_name, param in Register.get_all_params(): + if param in seen_params: + continue + seen_params.add(param) + param.add_to_cmdline_parser(parser, param_name, task_name, optparse=optparse, glob=True, is_without_section=is_without_section) - self.add_global_parameters(parser) - if main_task_cls: - self.add_task_parameters(parser, main_task_cls) +def get_task_parameters(task_cls, args): + # Parse a str->str dict to the correct types + params = {} + for param_name, param in task_cls.get_params(): + param.parse_from_args(param_name, task_cls.task_family, args, params) + return params + + +def set_global_parameters(args): + # Note that this is not side effect free + for task_name, is_without_section, param_name, param in Register.get_all_params(): + param.set_global_from_args(param_name, task_name, args, is_without_section=is_without_section) - else: - orderedtasks = '{%s}' % ','.join(sorted(Register.get_reg().keys())) - subparsers = parser.add_subparsers(dest='command', metavar=orderedtasks) - for name, cls in Register.get_reg().iteritems(): - subparser = subparsers.add_parser(name) - if cls == Register.AMBIGUOUS_CLASS: - continue - self.add_task_parameters(subparser, cls) +class ArgParseInterface(Interface): + """ + Takes the task as the command, with parameters specific to it. + """ - # Add global params here as well so that we can support both: - # test.py --global-param xyz Test --n 42 - # test.py Test --n 42 --global-param xyz - self.add_global_parameters(subparser) + def parse_task(self, cmdline_args=None, main_task_cls=None): + parser = argparse.ArgumentParser() - args = parser.parse_args(args=cmdline_args) - params = vars(args) # convert to a str -> str hash + add_global_parameters(parser) if main_task_cls: + add_task_parameters(parser, main_task_cls) + + args = parser.parse_args(args=cmdline_args) task_cls = main_task_cls else: - task_cls = Register.get_task_cls(args.command) + task_names = sorted(Register.get_reg().keys()) + + # Parse global arguments and pull out the task name. 
+ # We used to do this using subparsers+command, but some issues with + # argparse across different versions of Python (2.7.9) made it hard. + args, unknown = parser.parse_known_args(args=cmdline_args) + if len(unknown) == 0: + raise SystemExit('No task specified') + task_name = unknown[0] + if task_name not in task_names: + error_task_names(task_name, task_names) + + task_cls = Register.get_task_cls(task_name) + + # Add a subparser to parse task-specific arguments + subparsers = parser.add_subparsers(dest='command') + subparser = subparsers.add_parser(task_name) + + # Add both task and global params here so that we can support both: + # test.py --global-param xyz Test --n 42 + # test.py Test --n 42 --global-param xyz + add_global_parameters(subparser) + add_task_parameters(subparser, task_cls) + + # Workaround for bug in argparse for Python 2.7.9 + # See https://mail.python.org/pipermail/python-dev/2015-January/137699.html + subargs = parser.parse_args(args=cmdline_args) + for key, value in vars(subargs).items(): + if value: # Either True (for boolean args) or non-None (everything else) + setattr(args, key, value) # Notice that this is not side effect free because it might set global params - task = task_cls.from_str_params(params, Register.get_global_params()) + set_global_parameters(args) + task_params = get_task_parameters(task_cls, args) - return [task] + return [task_cls(**task_params)] def parse(self, cmdline_args=None, main_task_cls=None): return self.parse_task(cmdline_args, main_task_cls) class DynamicArgParseInterface(ArgParseInterface): - ''' Uses --module as a way to load modules dynamically + """ + Uses --module as a way to load modules dynamically Usage: - python whatever.py --module foo_module FooTask --blah xyz --x 123 - This will dynamically import foo_module and then try to create FooTask from this - ''' + .. code-block:: console + + python whatever.py --module foo_module FooTask --blah xyz --x 123 + + This will dynamically import foo_module and then try to create FooTask from this. + """ def parse(self, cmdline_args=None, main_task_cls=None): - parser = ErrorWrappedArgumentParser() + parser = argparse.ArgumentParser() - self.add_global_parameters(parser) + add_global_parameters(parser) args, unknown = parser.parse_known_args(args=cmdline_args) module = args.module @@ -342,35 +308,35 @@ def parse(self, cmdline_args=None, main_task_cls=None): class PassThroughOptionParser(optparse.OptionParser): - ''' + """ An unknown option pass-through implementation of OptionParser. - When unknown arguments are encountered, bundle with largs and try again, - until rargs is depleted. + When unknown arguments are encountered, bundle with largs and try again, until rargs is depleted. sys.exit(status) will still be called if a known argument is passed incorrectly (e.g. missing arguments or bad argument types, etc.) - ''' + """ + def _process_args(self, largs, rargs, values): while rargs: try: optparse.OptionParser._process_args(self, largs, rargs, values) - except (optparse.BadOptionError, optparse.AmbiguousOptionError), e: + except (optparse.BadOptionError, optparse.AmbiguousOptionError) as e: largs.append(e.opt_str) class OptParseInterface(Interface): - ''' Supported for legacy reasons where it's necessary to interact with an existing parser. + """ + Supported for legacy reasons where it's necessary to interact with an existing parser. Takes the task using --task. All parameters to all possible tasks will be defined globally in a big unordered soup. 
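The "did you mean" suggestions mentioned above come from ``error_task_names`` and the unweighted ``_editdistance``; for illustration:

.. code-block:: python

    from luigi.interface import _editdistance

    # suggestions are shown for candidates with distance <= 5 (and < len(name))
    assert _editdistance('FoTask', 'FooTask') == 1    # one missing character
    assert _editdistance('FooTask', 'FooTask') == 0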
- ''' + """ + def __init__(self, existing_optparse): self.__existing_optparse = existing_optparse def parse(self, cmdline_args=None, main_task_cls=None): - global_params = list(Register.get_global_params()) - parser = PassThroughOptionParser() def add_task_option(p): @@ -379,26 +345,7 @@ def add_task_option(p): else: p.add_option('--task', help='Task to run (one of %s)' % Register.tasks_str()) - def _add_parameter(parser, param_name, param): - description = [param_name] - if param.description: - description.append(param.description) - if param.has_value: - description.append(" [default: %s]" % (param.value,)) - - if param.is_list: - action = "append" - elif param.is_boolean: - action = "store_true" - else: - action = "store" - parser.add_option('--' + param_name.replace('_', '-'), - help=' '.join(description), - default=None, - action=action) - - for param_name, param in global_params: - _add_parameter(parser, param_name, param) + add_global_parameters(parser, optparse=True) add_task_option(parser) options, args = parser.parse_args(args=cmdline_args) @@ -413,34 +360,43 @@ def _add_parameter(parser, param_name, param): task_cls = Register.get_task_cls(task_cls_name) # Register all parameters as a big mess - params = task_cls.get_nonglobal_params() - - for param_name, param in global_params: - _add_parameter(parser, param_name, param) - - for param_name, param in params: - _add_parameter(parser, param_name, param) + add_global_parameters(parser, optparse=True) + add_task_parameters(parser, task_cls, optparse=True) # Parse and run options, args = parser.parse_args(args=cmdline_args) - params = {} - for k, v in vars(options).iteritems(): - if k != 'task': - params[k] = v + set_global_parameters(options) + task_params = get_task_parameters(task_cls, options) - task = task_cls.from_str_params(params, global_params) + return [task_cls(**task_params)] - return [task] +def load_task(module, task_name, params_str): + """ + Imports task dynamically given a module and a task name. + """ + __import__(module) + task_cls = Register.get_task_cls(task_name) + return task_cls.from_str_params(params_str) -def run(cmdline_args=None, existing_optparse=None, use_optparse=False, main_task_cls=None, worker_scheduler_factory=None, use_dynamic_argparse=False): - ''' Run from cmdline. - The default parser uses argparse. - However for legacy reasons we support optparse that optionally allows for - overriding an existing option parser with new args. - ''' +def run(cmdline_args=None, existing_optparse=None, use_optparse=False, main_task_cls=None, + worker_scheduler_factory=None, use_dynamic_argparse=False, local_scheduler=False): + """ + Run from cmdline. + + The default parser uses argparse however, for legacy reasons, + we support optparse that optionally allows for overriding an existing option parser with new args. 
+ + :param cmdline_args: + :param existing_optparse: + :param use_optparse: + :param main_task_cls: + :param worker_scheduler_factory: + :param use_dynamic_argparse: + :param local_scheduler: + """ if use_optparse: interface = OptParseInterface(existing_optparse) elif use_dynamic_argparse: @@ -448,21 +404,34 @@ def run(cmdline_args=None, existing_optparse=None, use_optparse=False, main_task else: interface = ArgParseInterface() tasks = interface.parse(cmdline_args, main_task_cls=main_task_cls) - return interface.run(tasks, worker_scheduler_factory) + override_defaults = {} + if local_scheduler: + override_defaults['local_scheduler'] = True + return interface.run(tasks, worker_scheduler_factory, override_defaults=override_defaults) def build(tasks, worker_scheduler_factory=None, **env_params): - ''' Run internally, bypassing the cmdline parsing. + """ + Run internally, bypassing the cmdline parsing. Useful if you have some luigi code that you want to run internally. - Example - luigi.build([MyTask1(), MyTask2()], local_scheduler=True) + Example: + + .. code-block:: python + + luigi.build([MyTask1(), MyTask2()], local_scheduler=True) One notable difference is that `build` defaults to not using the identical process lock. Otherwise, `build` would only be callable once from each process. - ''' - if "no_lock" not in env_params and "lock" not in env_params: + + :param tasks: + :param worker_scheduler_factory: + :param env_params: + :return: + """ + if "no_lock" not in env_params: + # TODO(erikbern): should we really override args here? env_params["no_lock"] = True - env_params["lock"] = False + Interface.run(tasks, worker_scheduler_factory, env_params) diff --git a/luigi/lock.py b/luigi/lock.py index 4171e3d436..5d56ff3c70 100644 --- a/luigi/lock.py +++ b/luigi/lock.py @@ -12,13 +12,16 @@ # License for the specific language governing permissions and limitations under # the License. -import os import hashlib +import os def getpcmd(pid): - ''' Returns command of process - ''' + """ + Returns command of process. + + :param pid: + """ cmd = 'ps -p %s -o command=' % (pid,) p = os.popen(cmd, 'r') return p.readline().strip() @@ -35,20 +38,21 @@ def get_info(pid_dir): def acquire_for(pid_dir, num_available=1): - ''' Makes sure the process is only run once at the same time with the same name. + """ + Makes sure the process is only run once at the same time with the same name. Notice that we since we check the process name, different parameters to the same command can spawn multiple processes at the same time, i.e. running "/usr/bin/my_process" does not prevent anyone from launching "/usr/bin/my_process --foo bar". - ''' + """ my_pid, my_cmd, pid_file = get_info(pid_dir) # Check if there is a pid file corresponding to this name if not os.path.exists(pid_dir): os.mkdir(pid_dir) - os.chmod(pid_dir, 0777) + os.chmod(pid_dir, 0o777) pids = set() pid_cmds = {} @@ -78,6 +82,6 @@ def acquire_for(pid_dir, num_available=1): else: s = os.stat(pid_file) if os.getuid() == s.st_uid: - os.chmod(pid_file, s.st_mode | 0777) + os.chmod(pid_file, s.st_mode | 0o777) return True diff --git a/luigi/mock.py b/luigi/mock.py index ec61f1d485..c954ee847d 100644 --- a/luigi/mock.py +++ b/luigi/mock.py @@ -12,17 +12,19 @@ # License for the specific language governing permissions and limitations under # the License. 
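A Unix-only sketch of how the pid-file lock above behaves (``getpcmd`` shells out to ``ps``; the directory name is a throwaway):

.. code-block:: python

    import os
    import tempfile

    from luigi import lock

    pid_dir = os.path.join(tempfile.gettempdir(), 'luigi-lock-demo')
    if lock.acquire_for(pid_dir, num_available=1):
        print 'no identical command line is currently holding the lock'
    else:
        print 'another identical command is already running'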
+import multiprocessing +import os import StringIO -import target import sys -import os + import luigi.util -import multiprocessing +import target class MockFileSystem(target.FileSystem): - """MockFileSystem inspects/modifies _data to simulate - file system operations""" + """ + MockFileSystem inspects/modifies _data to simulate file system operations. + """ _data = None def get_all_data(self): @@ -38,9 +40,11 @@ def exists(self, path): return MockFile(path).exists() def remove(self, path, recursive=True, skip_trash=True): - """Removes the given mockfile. skip_trash doesn't have any meaning.""" + """ + Removes the given mockfile. skip_trash doesn't have any meaning. + """ if recursive: - to_delete=[] + to_delete = [] for s in self.get_all_data().keys(): if s.startswith(path): to_delete.append(s) @@ -50,13 +54,16 @@ def remove(self, path, recursive=True, skip_trash=True): self.get_all_data().pop(path) def listdir(self, path): - """listdir does a prefix match of self.get_all_data(), but - doesn't yet support globs""" + """ + listdir does a prefix match of self.get_all_data(), but doesn't yet support globs. + """ return [s for s in self.get_all_data().keys() if s.startswith(path)] - def mkdir(self, path): - """mkdir is a noop""" + def mkdir(self, path, parents=True, raise_if_exists=False): + """ + mkdir is a noop. + """ pass def clear(self): @@ -80,9 +87,6 @@ def rename(self, path, fail_if_exists=False): contents = self.fs.get_all_data().pop(self._fn) self.fs.get_all_data()[path] = contents - def move_dir(self, path): - self.move(path, raise_if_exists=True) - @property def path(self): return self._fn @@ -92,6 +96,7 @@ def open(self, mode): class StringBuffer(StringIO.StringIO): # Just to be able to do writing + reading from the same buffer + def write(self2, data): if self._mirror_on_stderr: self2.seek(-1, os.SEEK_END) @@ -105,8 +110,8 @@ def close(self2): self.fs.get_all_data()[fn] = self2.getvalue() StringIO.StringIO.close(self2) - def __exit__(self, type, value, traceback): - if not type: + def __exit__(self, exc_type, exc_val, exc_tb): + if not exc_type: self.close() def __enter__(self): @@ -119,7 +124,9 @@ def __enter__(self): def skip(func): - """ Sort of a substitute for unittest.skip*, which is 2.7+ """ + """ + Sort of a substitute for unittest.skip*, which is 2.7+. + """ def wrapper(): pass return wrapper diff --git a/luigi/mrrunner.py b/luigi/mrrunner.py index ea9c2f37f6..0a06d314fa 100644 --- a/luigi/mrrunner.py +++ b/luigi/mrrunner.py @@ -14,22 +14,25 @@ # License for the specific language governing permissions and limitations under # the License. -"""The hadoop runner. +""" +The hadoop runner. This module contains the main() method which will be used to run the mapper and reducer on the Hadoop nodes. """ +import cPickle as pickle +import logging import os import sys import tarfile -import cPickle as pickle -import logging import traceback class Runner(object): - """Run the mapper or reducer on hadoop nodes.""" + """ + Run the mapper or reducer on hadoop nodes. 
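``MockFile`` exists mainly for tests; a round trip through the shared in-memory dict looks roughly like this (the path is arbitrary):

.. code-block:: python

    from luigi.mock import MockFile

    target = MockFile('/a/made/up/path.txt')
    with target.open('w') as f:
        f.write('hello mock\n')

    assert target.exists()
    with target.open('r') as f:
        assert f.read() == 'hello mock\n'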
+ """ def __init__(self, job=None): self.extract_packages_archive() @@ -38,11 +41,11 @@ def __init__(self, job=None): def run(self, kind, stdin=sys.stdin, stdout=sys.stdout): if kind == "map": - self.job._run_mapper(stdin, stdout) + self.job.run_mapper(stdin, stdout) elif kind == "combiner": - self.job._run_combiner(stdin, stdout) + self.job.run_combiner(stdin, stdout) elif kind == "reduce": - self.job._run_reducer(stdin, stdout) + self.job.run_reducer(stdin, stdout) else: raise Exception('weird command: %s' % kind) @@ -63,8 +66,9 @@ def print_exception(exc): print >> sys.stderr, 'luigi-exc-hex=%s' % tb.encode('hex') -def main(args=sys.argv, stdin=sys.stdin, stdout=sys.stdout, print_exception=print_exception): - """Run either the mapper or the reducer from the class instance in the file "job-instance.pickle". +def main(args=None, stdin=sys.stdin, stdout=sys.stdout, print_exception=print_exception): + """ + Run either the mapper or the reducer from the class instance in the file "job-instance.pickle". Arguments: @@ -73,10 +77,10 @@ def main(args=sys.argv, stdin=sys.stdin, stdout=sys.stdout, print_exception=prin try: # Set up logging. logging.basicConfig(level=logging.WARN) - - kind = args[1] + + kind = args is not None and args[1] or sys.argv[1] Runner().run(kind, stdin=stdin, stdout=stdout) - except Exception, exc: + except Exception as exc: # Dump encoded data that we will try to fetch using mechanize print_exception(exc) raise diff --git a/luigi/notifications.py b/luigi/notifications.py index b7435d2fdd..9a4652af91 100644 --- a/luigi/notifications.py +++ b/luigi/notifications.py @@ -1,7 +1,23 @@ -import sys +# Copyright (c) 2015 Spotify AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
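The new ``kind = args is not None and args[1] or sys.argv[1]`` line relies on the and/or idiom; it only behaves as intended because the mapper/combiner/reducer names are truthy strings. An equivalent, more explicit form for reference:

.. code-block:: python

    # Equivalent to the and/or fallback above (safe here because args[1] is
    # always a non-empty string such as "map", "combiner" or "reduce"):
    if args is not None:
        kind = args[1]
    else:
        kind = sys.argv[1]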
+ import logging import socket +import sys + from luigi import configuration + logger = logging.getLogger("luigi-interface") @@ -78,6 +94,7 @@ def send_email_ses(config, sender, subject, message, recipients, image_png): source=msg_root['From'], destinations=msg_root['To']) + def send_email_sendgrid(config, sender, subject, message, recipients, image_png): import sendgrid client = sendgrid.SendGridClient(config.get('email', 'SENDGRID_USERNAME', None), @@ -96,7 +113,10 @@ def send_email_sendgrid(config, sender, subject, message, recipients, image_png) client.send(to_send) + def send_email(subject, message, sender, recipients, image_png=None): + config = configuration.get_config() + subject = _prefix(subject) logger.debug("Emailing:\n" "-------------\n" @@ -108,12 +128,10 @@ def send_email(subject, message, sender, recipients, image_png=None): "-------------", recipients, sender, subject, message) if not recipients or recipients == (None,): return - if sys.stdout.isatty() or DEBUG: + if (sys.stdout.isatty() or DEBUG) and (not config.getboolean('email', 'force-send', False)): logger.info("Not sending email when running from a tty or in debug mode") return - config = configuration.get_config() - # Clean the recipients lists to allow multiple error-email addresses, comma # separated in client.cfg recipients_tmp = [] @@ -133,9 +151,10 @@ def send_email(subject, message, sender, recipients, image_png=None): def send_error_email(subject, message): - """ Sends an email to the configured error-email. + """ + Sends an email to the configured error-email. - If no error-email is configured, then a message is logged + If no error-email is configured, then a message is logged. """ config = configuration.get_config() receiver = config.get('core', 'error-email', None) @@ -155,8 +174,9 @@ def send_error_email(subject, message): def _prefix(subject): - """If the config has a special prefix for emails then this function adds - this prefix + """ + If the config has a special prefix for emails then this function adds + this prefix. """ config = configuration.get_config() email_prefix = config.get('core', 'email-prefix', None) diff --git a/luigi/parameter.py b/luigi/parameter.py index 5b96f829a4..246e557f93 100644 --- a/luigi/parameter.py +++ b/luigi/parameter.py @@ -12,41 +12,54 @@ # License for the specific language governing permissions and limitations under # the License. -import configuration import datetime import warnings -from ConfigParser import NoSectionError, NoOptionError +from ConfigParser import NoOptionError, NoSectionError + +import configuration +from deprecate_kwarg import deprecate_kwarg _no_value = object() class ParameterException(Exception): - """Base exception.""" + """ + Base exception. + """ pass class MissingParameterException(ParameterException): - """Exception signifying that there was a missing Parameter.""" + """ + Exception signifying that there was a missing Parameter. + """ pass class UnknownParameterException(ParameterException): - """Exception signifying that an unknown Parameter was supplied.""" + """ + Exception signifying that an unknown Parameter was supplied. + """ pass class DuplicateParameterException(ParameterException): - """Exception signifying that a Parameter was specified multiple times.""" + """ + Exception signifying that a Parameter was specified multiple times. 
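``send_email`` now consults a new ``force-send`` option so that emails can still be delivered from a tty or DEBUG run. A small sketch of the lookup, assuming a ``client.cfg`` with an ``[email]`` section containing ``force-send: true``:

.. code-block:: python

    from luigi import configuration

    # Mirrors the new gate in send_email(): defaults to False, so behaviour is
    # unchanged unless [email] force-send is set in client.cfg.
    force_send = configuration.get_config().getboolean('email', 'force-send', False)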
+ """ pass class UnknownConfigException(ParameterException): - """Exception signifying that the ``default_from_config`` for the Parameter could not be found.""" + """ + Exception signifying that the ``config_path`` for the Parameter could not be found. + """ pass class Parameter(object): - """An untyped Parameter + """ + An untyped Parameter Parameters are objects set on the Task class level to make it possible to parameterize tasks. For instance: @@ -63,19 +76,21 @@ class MyTask(luigi.Task): The ``config_path`` argument lets you specify a place where the parameter is read from config in case no value is provided. - Providing ``is_global=True`` changes the behavior of the parameter so that the value is shared - across all instances of the task. Global parameters can be provided in several ways. In - falling order of precedence: + When a task is instantiated, it will first use any argument as the value of the parameter, eg. + if you instantiate a = TaskA(x=44) then a.x == 44. If this does not exist, it will use the value + of the Parameter object, which is defined on a class level. This will be resolved in this + order of falling priority: - * A value provided on the command line (eg. ``--my-global-value xyz``) - * A value provided via config (using the ``config_path`` argument) - * A default value set using the ``default`` flag. + * Any value provided on the command line on the class level (eg. ``--TaskA-param xyz``) + * Any value provided via config (using the ``config_path`` argument) + * Any default value set using the ``default`` flag. """ counter = 0 """non-atomically increasing counter used for ordering parameters.""" + @deprecate_kwarg('is_boolean', 'is_bool', False) def __init__(self, default=_no_value, is_list=False, is_boolean=False, is_global=False, significant=True, description=None, - config_path=None, default_from_config=None): + config_path=None): """ :param default: the default value for this parameter. This should match the type of the Parameter, i.e. ``datetime.date`` for ``DateParameter`` or ``int`` for @@ -84,10 +99,10 @@ def __init__(self, default=_no_value, is_list=False, is_boolean=False, is_global :param bool is_list: specify ``True`` if the parameter should allow a list of values rather than a single value. Default: ``False``. A list has an implicit default value of ``[]``. - :param bool is_boolean: specify ``True`` if the parameter is a boolean value. Default: - ``False``. Boolean's have an implicit default value of ``False``. + :param bool is_bool: specify ``True`` if the parameter is a bool value. Default: + ``False``. Bool's have an implicit default value of ``False``. :param bool is_global: specify ``True`` if the parameter is global (i.e. used by multiple - Tasks). Default: ``False``. + Tasks). Default: ``False``. DEPRECATED. :param bool significant: specify ``False`` if the parameter should not be treated as part of the unique identifier for a Task. An insignificant Parameter might also be used to specify a password or other sensitive information @@ -106,20 +121,21 @@ def __init__(self, default=_no_value, is_list=False, is_boolean=False, is_global self.__global = _no_value self.is_list = is_list - self.is_boolean = is_boolean and not is_list # Only BooleanParameter should ever use this. TODO(erikbern): should we raise some kind of exception? + self.is_bool = is_boolean and not is_list # Only BoolParameter should ever use this. TODO(erikbern): should we raise some kind of exception? 
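The rewritten ``Parameter`` docstring describes the new resolution order. A minimal sketch of what that order means in practice (``TaskA`` and its values are invented for the example):

.. code-block:: python

    import luigi

    class TaskA(luigi.Task):
        x = luigi.Parameter(default='hello')

    # 1. A constructor argument always wins:
    a = TaskA(x='44')
    assert a.x == '44'

    # 2. Otherwise a class-level value from the command line is used,
    #    e.g. --TaskA-x 42
    # 3. Otherwise a value read from config (if config_path was given).
    # 4. Otherwise the default declared above:
    assert TaskA().x == 'hello'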
self.is_global = is_global # It just means that the default value is exposed and you can override it - self.significant = significant # Whether different values for this parameter will differentiate otherwise equal tasks + self.significant = significant # Whether different values for this parameter will differentiate otherwise equal tasks - if default_from_config is not None: + if is_global: warnings.warn( - "Use config_path parameter, not default_from_config", + 'is_global is deprecated and will be removed. Please use either ' + ' (a) class level config (eg. --MyTask-my-param 42)' + ' (b) a separate Config class with global settings on it', DeprecationWarning, - stacklevel=2 - ) - config_path = default_from_config + stacklevel=2) if is_global and default == _no_value and config_path is None: raise ParameterException('Global parameters need default values') + self.description = description if config_path is not None and ('section' not in config_path or 'name' not in config_path): @@ -129,18 +145,21 @@ def __init__(self, default=_no_value, is_list=False, is_boolean=False, is_global self.counter = Parameter.counter # We need to keep track of this to get the order right (see Task class) Parameter.counter += 1 - def _get_value_from_config(self): + def _get_value_from_config(self, task_name, param_name): """Loads the default from the config. Returns _no_value if it doesn't exist""" - if not self.__config: + if self.__config: + section, name = self.__config['section'], self.__config['name'] + elif task_name is not None and param_name is not None: + section, name = task_name, param_name + else: return _no_value conf = configuration.get_config() - (section, name) = (self.__config['section'], self.__config['name']) try: value = conf.get(section, name) - except (NoSectionError, NoOptionError), e: + except (NoSectionError, NoOptionError) as e: return _no_value if self.is_list: @@ -148,38 +167,33 @@ def _get_value_from_config(self): else: return self.parse(value) + def _get_value(self, task_name=None, param_name=None): + values = [self.__global, self._get_value_from_config(task_name, param_name), self.__default] + for value in values: + if value != _no_value: + return value + else: + return _no_value + @property def has_value(self): - """``True`` if a default was specified or if config_path references a valid entry in the conf. + """ + ``True`` if a default was specified or if config_path references a valid entry in the conf. Note that "value" refers to the Parameter object itself - it can be either + 1. The default value for this parameter 2. A value read from the config 3. A global value Any Task instance can have its own value set that overrides this. """ - values = [self.__global, self._get_value_from_config(), self.__default] - for value in values: - if value != _no_value: - return True - else: - return False - - @property - def has_default(self): - """Don't use this function - see has_value instead""" - warnings.warn( - 'Use has_value rather than has_default. The meaning of ' - '"default" has changed', - DeprecationWarning, - stacklevel=2 - ) - return self.has_value + return self._get_value() != _no_value @property def value(self): - """The value for this Parameter. + """ + The value for this Parameter. This refers to any value defined by a default, a config option, or a global value. @@ -187,49 +201,36 @@ def value(self): :raises MissingParameterException: if a value is not set. :return: the parsed value. 
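``_get_value_from_config`` now falls back to ``(task_name, param_name)`` when no explicit ``config_path`` is given, which is what lets a parameter be configured from a section named after the task. A hedged sketch, with a hypothetical ``client.cfg``:

.. code-block:: python

    import luigi

    class TaskA(luigi.Task):
        # No default and no config_path: with the new fallback, the value can
        # still come from a hypothetical client.cfg section such as
        #
        #   [TaskA]
        #   x: 42
        #
        # which is resolved via _get_value_from_config('TaskA', 'x').
        x = luigi.IntParameter()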
""" - values = [self.__global, self._get_value_from_config(), self.__default] - for value in values: - if value != _no_value: - return value - else: + value = self._get_value() + if value == _no_value: raise MissingParameterException("No default specified") + else: + return value - @property - def default(self): - warnings.warn( - 'Use value rather than default. The meaning of ' - '"default" has changed', - DeprecationWarning, - stacklevel=2 - ) - return self.value + def has_task_value(self, task_name, param_name): + return self._get_value(task_name, param_name) != _no_value + + def task_value(self, task_name, param_name): + value = self._get_value(task_name, param_name) + if value == _no_value: + raise MissingParameterException("No default specified") + else: + return value def set_global(self, value): - """Set the global value of this Parameter. + """ + Set the global value of this Parameter. :param value: the new global value. """ - assert self.is_global self.__global = value def reset_global(self): self.__global = _no_value - def set_default(self, value): - """Set the default value of this Parameter. - - :param value: the new default value. - """ - warnings.warn( - 'Use set_global rather than set_default. The meaning of ' - '"default" has changed', - DeprecationWarning, - stacklevel=2 - ) - self.__default = value - def parse(self, x): - """Parse an individual value from the input. + """ + Parse an individual value from the input. The default implementation is an identify (it returns ``x``), but subclasses should override this method for specialized parsing. This method is called by :py:meth:`parse_from_input` @@ -241,8 +242,9 @@ def parse(self, x): """ return x # default impl - def serialize(self, x): # opposite of parse - """Opposite of :py:meth:`parse`. + def serialize(self, x): # opposite of parse + """ + Opposite of :py:meth:`parse`. Converts the value ``x`` to a string. @@ -264,13 +266,13 @@ def parse_from_input(self, param_name, x): if not x: if self.has_value: return self.value - elif self.is_boolean: + elif self.is_bool: return False elif self.is_list: return [] else: - raise MissingParameterException("No value for '%s' (%s) submitted and no default value has been assigned." % \ - (param_name, "--" + param_name.replace('_', '-'))) + raise MissingParameterException("No value for '%s' (%s) submitted and no default value has been assigned." 
% + (param_name, "--" + param_name.replace('_', '-'))) elif self.is_list: return tuple(self.parse(p) for p in x) else: @@ -282,72 +284,174 @@ def serialize_to_input(self, x): else: return self.serialize(x) + def parser_dest(self, param_name, task_name, glob=False, is_without_section=False): + if self.is_global or is_without_section: + if glob: + return param_name + else: + return None + else: + if glob: + return task_name + '_' + param_name + else: + return param_name + + def add_to_cmdline_parser(self, parser, param_name, task_name, optparse=False, glob=False, is_without_section=False): + dest = self.parser_dest(param_name, task_name, glob, is_without_section=is_without_section) + if not dest: + return + flag = '--' + dest.replace('_', '-') + + description = [] + description.append('%s.%s' % (task_name, param_name)) + if self.description: + description.append(self.description) + if self.has_value: + description.append(" [default: %s]" % (self.value,)) + + if self.is_list: + action = "append" + elif self.is_bool: + action = "store_true" + else: + action = "store" + if optparse: + f = parser.add_option + else: + f = parser.add_argument + f(flag, + help=' '.join(description), + action=action, + dest=dest) + + def parse_from_args(self, param_name, task_name, args, params): + # Note: modifies arguments + dest = self.parser_dest(param_name, task_name, glob=False) + if dest is not None: + value = getattr(args, dest, None) + params[param_name] = self.parse_from_input(param_name, value) + + def set_global_from_args(self, param_name, task_name, args, is_without_section=False): + # Note: side effects + dest = self.parser_dest(param_name, task_name, glob=True, is_without_section=is_without_section) + if dest is not None: + value = getattr(args, dest, None) + if value is not None: + self.set_global(self.parse_from_input(param_name, value)) + else: + self.reset_global() + class DateHourParameter(Parameter): - """Parameter whose value is a :py:class:`~datetime.datetime` specified to the hour. + """ + Parameter whose value is a :py:class:`~datetime.datetime` specified to the hour. A DateHourParameter is a `ISO 8601 `_ formatted date and time specified to the hour. For example, ``2013-07-10T19`` specifies July 10, 2013 at 19:00. """ + date_format = '%Y-%m-%dT%H' # ISO 8601 is to use 'T' + def parse(self, s): """ Parses a string to a :py:class:`~datetime.datetime` using the format string ``%Y-%m-%dT%H``. """ # TODO(erikbern): we should probably use an internal class for arbitary # time intervals (similar to date_interval). Or what do you think? - return datetime.datetime.strptime(s, "%Y-%m-%dT%H") # ISO 8601 is to use 'T' + return datetime.datetime.strptime(s, self.date_format) def serialize(self, dt): """ Converts the datetime to a string usnig the format string ``%Y-%m-%dT%H``. """ - if dt is None: return str(dt) - return dt.strftime('%Y-%m-%dT%H') + if dt is None: + return str(dt) + return dt.strftime(self.date_format) + + +class DateMinuteParameter(DateHourParameter): + """ + Parameter whose value is a :py:class:`~datetime.datetime` specified to the minute. + + A DateMinuteParameter is a `ISO 8601 `_ formatted + date and time specified to the minute. For example, ``2013-07-10T19H07`` specifies July 10, 2013 at + 19:07. + """ + + date_format = '%Y-%m-%dT%HH%M' # ISO 8601 is to use 'T' and 'H' class DateParameter(Parameter): - """Parameter whose value is a :py:class:`~datetime.date`. + """ + Parameter whose value is a :py:class:`~datetime.date`. A DateParameter is a Date string formatted ``YYYY-MM-DD``. 
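The ``parse``/``serialize`` pair remains the extension point for custom parameter types; a hypothetical subclass as a sketch:

.. code-block:: python

    import json
    import luigi

    class JSONParameter(luigi.Parameter):
        """Hypothetical parameter type built on the parse()/serialize() hooks."""

        def parse(self, s):
            return json.loads(s)        # command-line/config string -> object

        def serialize(self, x):
            return json.dumps(x)        # object -> string (opposite of parse)

And the new class-level ``date_format`` plus the added ``DateMinuteParameter`` give, per the formats documented above:

.. code-block:: python

    import datetime
    from luigi.parameter import DateHourParameter, DateMinuteParameter

    assert DateHourParameter().parse('2013-07-10T19') == \
        datetime.datetime(2013, 7, 10, 19)
    assert DateHourParameter().serialize(datetime.datetime(2013, 7, 10, 19)) == \
        '2013-07-10T19'
    assert DateMinuteParameter().parse('2013-07-10T19H07') == \
        datetime.datetime(2013, 7, 10, 19, 7)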
For example, ``2013-07-10`` specifies July 10, 2013. """ + def parse(self, s): """Parses a date string formatted as ``YYYY-MM-DD``.""" return datetime.date(*map(int, s.split('-'))) class IntParameter(Parameter): - """Parameter whose value is an ``int``.""" + """ + Parameter whose value is an ``int``. + """ + def parse(self, s): - """Parses an ``int`` from the string using ``int()``.""" + """ + Parses an ``int`` from the string using ``int()``. + """ return int(s) + class FloatParameter(Parameter): - """Parameter whose value is a ``float``.""" + """ + Parameter whose value is a ``float``. + """ + def parse(self, s): - """Parses a ``float`` from the string using ``float()``.""" + """ + Parses a ``float`` from the string using ``float()``. + """ return float(s) -class BooleanParameter(Parameter): - """A Parameter whose value is a ``bool``.""" - # TODO(erikbern): why do we call this "boolean" instead of "bool"? - # The integer parameter is called "int" so calling this "bool" would be - # more consistent, especially given the Python type names. + +class BoolParameter(Parameter): + """ + A Parameter whose value is a ``bool``. + """ + def __init__(self, *args, **kwargs): - """This constructor passes along args and kwargs to ctor for :py:class:`Parameter` but - specifies ``is_boolean=True``. """ - super(BooleanParameter, self).__init__(*args, is_boolean=True, **kwargs) + This constructor passes along args and kwargs to ctor for :py:class:`Parameter` but + specifies ``is_bool=True``. + """ + super(BoolParameter, self).__init__(*args, is_bool=True, **kwargs) def parse(self, s): - """Parses a ``boolean`` from the string, matching 'true' or 'false' ignoring case.""" + """ + Parses a ``bool`` from the string, matching 'true' or 'false' ignoring case. + """ return {'true': True, 'false': False}[str(s).lower()] +class BooleanParameter(BoolParameter): + + def __init__(self, *args, **kwargs): + warnings.warn( + 'BooleanParameter is deprecated, use BoolParameter instead', + DeprecationWarning, + stacklevel=2 + ) + super(BooleanParameter, self).__init__(*args, **kwargs) + + class DateIntervalParameter(Parameter): - """A Parameter whose value is a :py:class:`~luigi.date_interval.DateInterval`. + """ + A Parameter whose value is a :py:class:`~luigi.date_interval.DateInterval`. Date Intervals are specified using the ISO 8601 `Time Interval `_ notation. @@ -356,7 +460,8 @@ class DateIntervalParameter(Parameter): # Also gives some helpful interval algebra def parse(self, s): - """Parses a `:py:class:`~luigi.date_interval.DateInterval` from the input. + """ + Parses a `:py:class:`~luigi.date_interval.DateInterval` from the input. see :py:mod:`luigi.date_interval` for details on the parsing of DateIntervals. @@ -374,12 +479,13 @@ def parse(self, s): class TimeDeltaParameter(Parameter): - """Class that maps to timedelta using strings in any of the following forms: + """ + Class that maps to timedelta using strings in any of the following forms: - - ``n {w[eek[s]]|d[ay[s]]|h[our[s]]|m[inute[s]|s[second[s]]}`` (e.g. "1 week 2 days" or "1 h") + * ``n {w[eek[s]]|d[ay[s]]|h[our[s]]|m[inute[s]|s[second[s]]}`` (e.g. 
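Two usage notes on the renames and formats introduced above. First, the ``BooleanParameter`` to ``BoolParameter`` rename (the old name remains as a deprecated alias):

.. code-block:: python

    import luigi
    from luigi.parameter import BoolParameter

    class MyTask(luigi.Task):
        # Formerly BooleanParameter(); instantiating the old class still works
        # but now emits a DeprecationWarning.
        flag = BoolParameter()                     # implicit default is False

    assert BoolParameter().parse('True') is True   # parse is case-insensitive

Second, the forms accepted by ``TimeDeltaParameter`` as listed in its docstring; the expected values below follow from those formats:

.. code-block:: python

    import datetime
    from luigi.parameter import TimeDeltaParameter

    p = TimeDeltaParameter()
    assert p.parse('1 week 2 days') == datetime.timedelta(days=9)
    assert p.parse('1 h') == datetime.timedelta(hours=1)
    assert p.parse('P1DT12H') == datetime.timedelta(days=1, hours=12)
    assert p.parse('P2W') == datetime.timedelta(weeks=2)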
"1 week 2 days" or "1 h") Note: multiple arguments must be supplied in longest to shortest unit order - - ISO 8601 duration ``PnDTnHnMnS`` (each field optional, years and months not supported) - - ISO 8601 duration ``PnW`` + * ISO 8601 duration ``PnDTnHnMnS`` (each field optional, years and months not supported) + * ISO 8601 duration ``PnW`` See https://en.wikipedia.org/wiki/ISO_8601#Durations """ @@ -391,7 +497,7 @@ def _apply_regex(self, regex, input): if re_match: kwargs = {} has_val = False - for k,v in re_match.groupdict(default="0").items(): + for k, v in re_match.groupdict(default="0").iteritems(): val = int(v) has_val = has_val or val != 0 kwargs[k] = val @@ -401,11 +507,12 @@ def _apply_regex(self, regex, input): def _parseIso8601(self, input): def field(key): return "(?P<%s>\d+)%s" % (key, key[0].upper()) + def optional_field(key): return "(%s)?" % field(key) # A little loose: ISO 8601 does not allow weeks in combination with other fields, but this regex does (as does python timedelta) regex = "P(%s|%s(T%s)?)" % (field("weeks"), optional_field("days"), "".join([optional_field(key) for key in ["hours", "minutes", "seconds"]])) - return self._apply_regex(regex,input) + return self._apply_regex(regex, input) def _parseSimple(self, input): keys = ["weeks", "days", "hours", "minutes", "seconds"] @@ -415,7 +522,8 @@ def _parseSimple(self, input): return self._apply_regex(regex, input) def parse(self, input): - """Parses a time delta from the input. + """ + Parses a time delta from the input. See :py:class:`TimeDeltaParameter` for details on supported formats. """ diff --git a/luigi/postgres.py b/luigi/postgres.py index 0c2d9a02eb..e0d6999c83 100644 --- a/luigi/postgres.py +++ b/luigi/postgres.py @@ -14,8 +14,8 @@ import datetime import logging -import tempfile import re +import tempfile import luigi from luigi.contrib import rdbms @@ -29,9 +29,10 @@ except ImportError: logger.warning("Loading postgres module without psycopg2 installed. Will crash at runtime if postgres functionality is used.") + class MultiReplacer(object): - # TODO: move to misc/util module - """Object for one-pass replace of multiple words + """ + Object for one-pass replace of multiple words Substituted parts will not be matched against other replace patterns, as opposed to when using multipass replace. The order of the items in the replace_pairs input will dictate replacement precedence. @@ -40,17 +41,28 @@ class MultiReplacer(object): replace_pairs -- list of 2-tuples which hold strings to be replaced and replace string Usage: - >>> replace_pairs = [("a", "b"), ("b", "c")] - >>> MultiReplacer(replace_pairs)("abcd") - 'bccd' - >>> replace_pairs = [("ab", "x"), ("a", "x")] - >>> MultiReplacer(replace_pairs)("ab") - 'x' - >>> replace_pairs.reverse() - >>> MultiReplacer(replace_pairs)("ab") - 'xb' + + .. code-block:: python + + >>> replace_pairs = [("a", "b"), ("b", "c")] + >>> MultiReplacer(replace_pairs)("abcd") + 'bccd' + >>> replace_pairs = [("ab", "x"), ("a", "x")] + >>> MultiReplacer(replace_pairs)("ab") + 'x' + >>> replace_pairs.reverse() + >>> MultiReplacer(replace_pairs)("ab") + 'xb' """ +# TODO: move to misc/util module + def __init__(self, replace_pairs): + """ + Initializes a MultiReplacer instance. + + :param replace_pairs: list of 2-tuples which hold strings to be replaced and replace string. 
+ :type replace_pairs: tuple + """ replace_list = list(replace_pairs) # make a copy in case input is iterable self._replace_dict = dict(replace_list) pattern = '|'.join(re.escape(x) for x, y in replace_list) @@ -78,9 +90,11 @@ def __call__(self, search_string): class PostgresTarget(luigi.Target): - """Target for a resource in Postgres. + """ + Target for a resource in Postgres. - This will rarely have to be directly instantiated by the user""" + This will rarely have to be directly instantiated by the user. + """ marker_table = luigi.configuration.get_config().get('postgres', 'marker-table', 'table_updates') # Use DB side timestamps or client side timestamps in the marker_table @@ -108,10 +122,12 @@ def __init__(self, host, database, user, password, table, update_id): self.update_id = update_id def touch(self, connection=None): - """Mark this update as complete. + """ + Mark this update as complete. Important: If the marker table doesn't exist, the connection transaction will be aborted - and the connection reset. Then the marker table will be created. + and the connection reset. + Then the marker table will be created. """ self.create_marker_table() @@ -125,14 +141,14 @@ def touch(self, connection=None): """INSERT INTO {marker_table} (update_id, target_table) VALUES (%s, %s) """.format(marker_table=self.marker_table), - (self.update_id, self.table)) + (self.update_id, self.table)) else: connection.cursor().execute( - """INSERT INTO {marker_table} (update_id, target_table, inserted) + """INSERT INTO {marker_table} (update_id, target_table, inserted) VALUES (%s, %s, %s); """.format(marker_table=self.marker_table), - (self.update_id, self.table, - datetime.datetime.now())) + (self.update_id, self.table, + datetime.datetime.now())) # make sure update is properly marked assert self.exists(connection) @@ -146,10 +162,10 @@ def exists(self, connection=None): cursor.execute("""SELECT 1 FROM {marker_table} WHERE update_id = %s LIMIT 1""".format(marker_table=self.marker_table), - (self.update_id,) - ) + (self.update_id,) + ) row = cursor.fetchone() - except psycopg2.ProgrammingError, e: + except psycopg2.ProgrammingError as e: if e.pgcode == psycopg2.errorcodes.UNDEFINED_TABLE: row = None else: @@ -157,7 +173,9 @@ def exists(self, connection=None): return row is not None def connect(self): - "Get a psycopg2 connection object to the database where the table is" + """ + Get a psycopg2 connection object to the database where the table is. + """ connection = psycopg2.connect( host=self.host, port=self.port, @@ -168,9 +186,11 @@ def connect(self): return connection def create_marker_table(self): - """Create marker table if it doesn't exist. + """ + Create marker table if it doesn't exist. - Using a separate connection since the transaction might have to be reset""" + Using a separate connection since the transaction might have to be reset. + """ connection = self.connect() connection.autocommit = True cursor = connection.cursor() @@ -188,7 +208,7 @@ def create_marker_table(self): """.format(marker_table=self.marker_table) try: cursor.execute(sql) - except psycopg2.ProgrammingError, e: + except psycopg2.ProgrammingError as e: if e.pgcode == psycopg2.errorcodes.DUPLICATE_TABLE: pass else: @@ -209,19 +229,21 @@ class CopyToTable(rdbms.CopyToTable): To customize how to access data from an input task, override the `rows` method with a generator that yields each row as a tuple with fields ordered according to `columns`. 
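For context on how ``PostgresTarget`` is used directly (rarely, per the docstring), a sketch with invented connection details:

.. code-block:: python

    from luigi.postgres import PostgresTarget

    target = PostgresTarget(host='localhost', database='analytics',   # hypothetical
                            user='luigi', password='secret',
                            table='daily_report',
                            update_id='DailyReport(date=2015-01-01)')

    conn = target.connect()
    if not target.exists(conn):
        # ... load the data inside the same transaction ...
        target.touch(conn)   # records update_id in the marker table
        conn.commit()
    conn.close()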
- """ def rows(self): - """Return/yield tuples or lists corresponding to each row to be inserted """ + """ + Return/yield tuples or lists corresponding to each row to be inserted. + """ with self.input().open('r') as fobj: for line in fobj: yield line.strip('\n').split('\t') def map_column(self, value): - """Applied to each column of every row returned by `rows` + """ + Applied to each column of every row returned by `rows`. - Default behaviour is to escape special characters and identify any self.null_values + Default behaviour is to escape special characters and identify any self.null_values. """ if value in self.null_values: return '\N' @@ -230,11 +252,11 @@ def map_column(self, value): else: return default_escape(str(value)) - # everything below will rarely have to be overridden def output(self): - """Returns a PostgresTarget representing the inserted dataset. + """ + Returns a PostgresTarget representing the inserted dataset. Normally you don't override this. """ @@ -245,8 +267,7 @@ def output(self): password=self.password, table=self.table, update_id=self.update_id() - ) - + ) def copy(self, cursor, file): if isinstance(self.columns[0], basestring): @@ -258,7 +279,8 @@ def copy(self, cursor, file): cursor.copy_from(file, self.table, null='\N', sep=self.column_separator, columns=column_names) def run(self): - """Inserts data generated by rows() into target table. + """ + Inserts data generated by rows() into target table. If the target table doesn't exist, self.create_table will be called to attempt to create the table. @@ -291,7 +313,7 @@ def run(self): cursor = connection.cursor() self.init_copy(connection) self.copy(cursor, tmp_file) - except psycopg2.ProgrammingError, e: + except psycopg2.ProgrammingError as e: if e.pgcode == psycopg2.errorcodes.UNDEFINED_TABLE and attempt == 0: # if first attempt fails with "relation not found", try creating table logger.info("Creating table %s", self.table) diff --git a/luigi/process.py b/luigi/process.py index a5cdd05257..ef73293f9f 100644 --- a/luigi/process.py +++ b/luigi/process.py @@ -12,12 +12,13 @@ # License for the specific language governing permissions and limitations under # the License. -import os -import signal -import random import datetime import logging import logging.handlers +import os +import random +import signal + rootlogger = logging.getLogger() server_logger = logging.getLogger("luigi.server") @@ -110,7 +111,8 @@ def daemonize(cmd, pidfile=None, logdir=None, api_port=8082, address=None): def fork_linked_workers(num_processes): - """ Forks num_processes child processes. + """ + Forks num_processes child processes. Returns an id between 0 and num_processes - 1 for each child process. Will consume the parent process and kill it and all child processes as soon as one child exits with status 0 @@ -132,7 +134,6 @@ def shutdown_handler(signum=None, frame=None): os.waitpid(c, 0) except OSError: print "Child %d is already dead" % c - pass os._exit(0) # exit without calling exit handler again... 
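A minimal ``CopyToTable`` subclass overriding ``rows()``, as the docstring above suggests; connection settings and data are invented:

.. code-block:: python

    import luigi
    from luigi import postgres

    class WriteReport(postgres.CopyToTable):
        # Hypothetical connection settings:
        host = 'localhost'
        database = 'analytics'
        user = 'luigi'
        password = 'secret'
        table = 'daily_report'
        columns = [('id', 'INT'), ('name', 'TEXT')]

        def rows(self):
            # Yield tuples ordered according to `columns` instead of reading
            # self.input().
            yield (1, 'foo')
            yield (2, 'bar')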
sigs = [signal.SIGINT, signal.SIGTERM, signal.SIGQUIT] @@ -140,7 +141,7 @@ def shutdown_handler(signum=None, frame=None): signal.signal(s, shutdown_handler) signal.signal(s, shutdown_handler) signal.signal(s, shutdown_handler) - #haven't found a way to unregister: atexit.register(shutdown_handler) # + # haven't found a way to unregister: atexit.register(shutdown_handler) # def fork_child(child_id, attempt): child_pid = os.fork() @@ -161,7 +162,7 @@ def fork_child(child_id, attempt): assert len(children) == num_processes - while 1: + while True: pid, status = os.wait() if status != 0: # unclean exit, restart process diff --git a/luigi/rpc.py b/luigi/rpc.py index fce00e3c8c..4ad2c37992 100644 --- a/luigi/rpc.py +++ b/luigi/rpc.py @@ -12,25 +12,29 @@ # License for the specific language governing permissions and limitations under # the License. -import urllib -import urllib2 -import logging import json +import logging import time -from scheduler import Scheduler, PENDING +import urllib +import urllib2 + import configuration +from scheduler import PENDING, Scheduler logger = logging.getLogger('luigi-interface') # TODO: 'interface'? class RPCError(Exception): + def __init__(self, message, sub_exception=None): super(RPCError, self).__init__(message) self.sub_exception = sub_exception class RemoteScheduler(Scheduler): - ''' Scheduler proxy object. Talks to a RemoteSchedulerResponder ''' + """ + Scheduler proxy object. Talks to a RemoteSchedulerResponder. + """ def __init__(self, host='localhost', port=8082, connect_timeout=None): self._host = host @@ -93,7 +97,7 @@ def ping(self, worker): self._request('/api/ping', {'worker': worker}, attempts=1) def add_task(self, worker, task_id, status=PENDING, runnable=False, - deps=None, new_deps=None, expl=None, resources={},priority=0, + deps=None, new_deps=None, expl=None, resources={}, priority=0, family='', params={}): self._request('/api/add_task', { 'task_id': task_id, @@ -110,19 +114,11 @@ def add_task(self, worker, task_id, status=PENDING, runnable=False, }) def get_work(self, worker, host=None): - ''' Ugly work around for an older scheduler version, where get_work doesn't have a host argument. Try once passing - host to it, falling back to the old version. Should be removed once people have had time to update everything - ''' - try: - return self._request( - '/api/get_work', - {'worker': worker, 'host': host}, - log_exceptions=False, - attempts=1 - ) - except: - logger.info("get_work RPC call failed, is it possible that you need to update your scheduler?") - raise + return self._request( + '/api/get_work', + {'worker': worker, 'host': host}, + log_exceptions=False, + attempts=1) def graph(self): return self._request('/api/graph', {}) @@ -147,60 +143,3 @@ def fetch_error(self, task_id): def add_worker(self, worker, info): return self._request('/api/add_worker', {'worker': worker, 'info': info}) - - -class RemoteSchedulerResponder(object): - """ Use on the server side for responding to requests - - The kwargs are there for forwards compatibility in case workers add - new (optional) arguments. That way there's no dependency on the server - component when upgrading Luigi on the worker side. - - TODO(erikbern): what is this class actually used for? 
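With the backwards-compatibility fallback removed, ``get_work`` always passes ``host``. A sketch of driving the RPC client directly, assuming a central scheduler is listening on ``localhost:8082``:

.. code-block:: python

    from luigi.rpc import RemoteScheduler

    scheduler = RemoteScheduler(host='localhost', port=8082)
    scheduler.ping(worker='example-worker')

    # host is now always sent; the try/except fallback for very old central
    # schedulers is gone.
    response = scheduler.get_work(worker='example-worker', host='example-host')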
Other than an - unnecessary layer of indirection around central scheduler - """ - - def __init__(self, scheduler): - self._scheduler = scheduler - - def add_task(self, worker, task_id, status, runnable, deps, new_deps, expl, - resources=None, priority=0, family='', params={}, **kwargs): - return self._scheduler.add_task( - worker, task_id, status, runnable, deps, new_deps, expl, - resources, priority, family, params) - - def add_worker(self, worker, info, **kwargs): - return self._scheduler.add_worker(worker, info) - - def get_work(self, worker, host=None, **kwargs): - return self._scheduler.get_work(worker, host) - - def ping(self, worker, **kwargs): - return self._scheduler.ping(worker) - - def graph(self, **kwargs): - return self._scheduler.graph() - - index = graph - - def dep_graph(self, task_id, **kwargs): - return self._scheduler.dep_graph(task_id) - - def inverse_dep_graph(self, task_id, **kwargs): - return self._scheduler.inverse_dependencies(task_id) - - def task_list(self, status, upstream_status, **kwargs): - return self._scheduler.task_list(status, upstream_status) - - def worker_list(self, **kwargs): - return self._scheduler.worker_list() - - def task_search(self, task_str, **kwargs): - return self._scheduler.task_search(task_str) - - def fetch_error(self, task_id, **kwargs): - return self._scheduler.fetch_error(task_id) - - @property - def task_history(self): - return self._scheduler.task_history diff --git a/luigi/s3.py b/luigi/s3.py index d8af947bd4..2e60b141e1 100644 --- a/luigi/s3.py +++ b/luigi/s3.py @@ -17,17 +17,15 @@ import os.path import random import tempfile -import warnings import urlparse +import warnings +from ConfigParser import NoSectionError import configuration -from ConfigParser import NoSectionError +from luigi.format import FileWrapper from luigi.parameter import Parameter -from luigi.target import FileSystem -from luigi.target import FileSystemTarget -from luigi.target import FileSystemException +from luigi.target import FileSystem, FileSystemException, FileSystemTarget from luigi.task import ExternalTask -from luigi.format import FileWrapper logger = logging.getLogger('luigi-interface') @@ -166,7 +164,7 @@ def put_string(self, content, destination_s3_path): (bucket, key) = self._path_to_bucket_and_key(destination_s3_path) # grab and validate the bucket s3_bucket = self.s3.get_bucket(bucket, validate=True) - + # put the content s3_key = Key(s3_bucket) s3_key.key = key @@ -194,14 +192,14 @@ def put_multipart(self, local_path, destination_s3_path, part_size=67108864): # grab and validate the bucket s3_bucket = self.s3.get_bucket(bucket, validate=True) - # calculate the number of parts (int division). + # calculate the number of parts (int division). 
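A usage sketch for ``put_string`` and ``put_multipart``; the credentials, bucket and paths are placeholders:

.. code-block:: python

    from luigi.s3 import S3Client

    client = S3Client(aws_access_key_id='AKIAEXAMPLE',       # hypothetical credentials
                      aws_secret_access_key='secret')

    client.put_string('hello world', 's3://example-bucket/hello.txt')
    client.put_multipart('/tmp/big_file.bin',                # hypothetical local file
                         's3://example-bucket/big_file.bin',
                         part_size=67108864)                 # 64 MiB parts (the default)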
# use modulo to avoid float precision issues # for exactly-sized fits num_parts = \ (source_size / part_size) \ if source_size % part_size == 0 \ else (source_size / part_size) + 1 - + mp = None try: mp = s3_bucket.initiate_multipart_upload(key) @@ -211,9 +209,9 @@ def put_multipart(self, local_path, destination_s3_path, part_size=67108864): offset = part_size * i bytes = min(part_size, source_size - offset) with open(local_path, 'rb') as fp: - part_num = i+1 - logger.info('Uploading part %s/%s to %s' % \ - (part_num, num_parts, destination_s3_path)) + part_num = i + 1 + logger.info('Uploading part %s/%s to %s' % + (part_num, num_parts, destination_s3_path)) fp.seek(offset) mp.upload_part_from_file(fp, part_num=part_num, size=bytes) @@ -221,13 +219,12 @@ def put_multipart(self, local_path, destination_s3_path, part_size=67108864): mp.complete_upload() except: if mp: - logger.info('Canceling multipart s3 upload for %s' % destination_s3_path) + logger.info('Canceling multipart s3 upload for %s' % destination_s3_path) # cancel the upload so we don't get charged for # storage consumed by uploaded parts mp.cancel_upload() raise - def copy(self, source_path, destination_path): """ Copy an object from one S3 location to another. @@ -305,7 +302,7 @@ def _get_s3_config(self, key=None): except NoSectionError: return {} # So what ports etc can be read without us having to specify all dtypes - for k, v in config.items(): + for k, v in config.iteritems(): try: config[k] = int(v) except ValueError: @@ -330,6 +327,7 @@ class AtomicS3File(file): """ An S3 file that writes to a temp file and put to S3 on close. """ + def __init__(self, path, s3_client): self.__tmp_path = \ os.path.join(tempfile.gettempdir(), @@ -353,7 +351,9 @@ def __del__(self): os.remove(self.__tmp_path) def __exit__(self, exc_type, exc, traceback): - " Close/commit the file if there are no exception " + """ + Close/commit the file if there are no exception. + """ if exc_type: return return file.__exit__(self, exc_type, exc, traceback) @@ -467,20 +467,34 @@ class S3FlagTarget(S3Target): Defines a target directory with a flag-file (defaults to `_SUCCESS`) used to signify job success. - This checks for two things: that the path exists (just like the S3Target) - and that the _SUCCESS file exists within the directory. Because Hadoop - outputs into a directory and not a single file, the path is assume to be a - directory. + This checks for two things: + + * the path exists (just like the S3Target) + * the _SUCCESS file exists within the directory. + + Because Hadoop outputs into a directory and not a single file, + the path is assumed to be a directory. + + This is meant to be a handy alternative to AtomicS3File. - This is meant to be a handy alternative to AtomicS3File. The AtomicFile - approach can be burdensome for S3 since there are no directories, per se. - If we have 1,000,000 output files, then we have to rename 1,000,000 - objects. + The AtomicFile approach can be burdensome for S3 since there are no directories, per se. + + If we have 1,000,000 output files, then we have to rename 1,000,000 objects. """ fs = None def __init__(self, path, format=None, client=None, flag='_SUCCESS'): + """ + Initializes a S3FlagTarget. + + :param path: the directory where the files are stored. + :type path: str + :param client: + :type client: + :param flag: + :type flag: str + """ if path[-1] is not "/": raise ValueError("S3FlagTarget requires the path to be to a " "directory. 
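The ``num_parts`` expression above is just ceiling division written out with a modulo check; an equivalent formulation (not the code in this diff) as a worked example:

.. code-block:: python

    # A 150 MiB file with the default 64 MiB part size is uploaded in 3 parts:
    part_size = 64 * 1024 ** 2
    source_size = 150 * 1024 ** 2
    num_parts = (source_size + part_size - 1) // part_size   # ceiling division
    assert num_parts == 3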
It must end with a slash ( / ).") @@ -498,6 +512,7 @@ class S3EmrTarget(S3FlagTarget): """ Deprecated. Use :py:class:`S3FlagTarget` """ + def __init__(self, *args, **kwargs): warnings.warn("S3EmrTarget is deprecated. Please use S3FlagTarget") super(S3EmrTarget, self).__init__(*args, **kwargs) @@ -505,8 +520,7 @@ def __init__(self, *args, **kwargs): class S3PathTask(ExternalTask): """ - A external task that to require existence of - a path in S3. + A external task that to require existence of a path in S3. """ path = Parameter() @@ -516,7 +530,7 @@ def output(self): class S3EmrTask(ExternalTask): """ - An external task that requires the existence of EMR output in S3 + An external task that requires the existence of EMR output in S3. """ path = Parameter() @@ -526,7 +540,7 @@ def output(self): class S3FlagTask(ExternalTask): """ - An external task that requires the existence of EMR output in S3 + An external task that requires the existence of EMR output in S3. """ path = Parameter() flag = Parameter(default=None) diff --git a/luigi/scalding.py b/luigi/scalding.py index ebb22aa6ce..488b39645e 100644 --- a/luigi/scalding.py +++ b/luigi/scalding.py @@ -1,261 +1,19 @@ -import logging -import os -import re -import subprocess - -from luigi import LocalTarget -import configuration -import hadoop -import hadoop_jar - -logger = logging.getLogger('luigi-interface') - -""" -Scalding support for Luigi. - -Example configuration section in client.cfg: -[scalding] -# scala home directory, which should include a lib subdir with scala jars. -scala-home: /usr/share/scala - -# scalding home directory, which should include a lib subdir with -# scalding-*-assembly-* jars as built from the official Twitter build script. -scalding-home: /usr/share/scalding - -# provided dependencies, e.g. jars required for compiling but not executing -# scalding jobs. Currently requred jars: -# org.apache.hadoop/hadoop-core/0.20.2 -# org.slf4j/slf4j-log4j12/1.6.6 -# log4j/log4j/1.2.15 -# commons-httpclient/commons-httpclient/3.1 -# commons-cli/commons-cli/1.2 -# org.apache.zookeeper/zookeeper/3.3.4 -scalding-provided: /usr/share/scalding/provided - -# additional jars required. -scalding-libjars: /usr/share/scalding/libjars -""" - - -class ScaldingJobRunner(hadoop.JobRunner): - """JobRunner for `pyscald` commands. 
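A sketch of constructing an ``S3FlagTarget`` per the reworded docstring; the bucket and prefix are invented, and note the required trailing slash:

.. code-block:: python

    from luigi.s3 import S3FlagTarget

    # exists() is true once both the directory and its _SUCCESS flag are present.
    target = S3FlagTarget('s3://example-bucket/output/2015-01-01/', flag='_SUCCESS')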
Used to run a ScaldingJobTask""" - - def __init__(self): - conf = configuration.get_config() - - default = os.environ.get('SCALA_HOME', '/usr/share/scala') - self.scala_home = conf.get('scalding', 'scala-home', default) - - default = os.environ.get('SCALDING_HOME', '/usr/share/scalding') - self.scalding_home = conf.get('scalding', 'scalding-home', default) - self.provided_dir = conf.get( - 'scalding', 'scalding-provided', os.path.join(default, 'provided')) - self.libjars_dir = conf.get( - 'scalding', 'scalding-libjars', os.path.join(default, 'libjars')) - - self.tmp_dir = LocalTarget(is_tmp=True) - - def _get_jars(self, path): - return [os.path.join(path, j) for j in os.listdir(path) - if j.endswith('.jar')] - - def get_scala_jars(self, include_compiler=False): - lib_dir = os.path.join(self.scala_home, 'lib') - jars = [os.path.join(lib_dir, 'scala-library.jar')] - - # additional jar for scala 2.10 only - reflect = os.path.join(lib_dir, 'scala-reflect.jar') - if os.path.exists(reflect): - jars.append(reflect) - - if include_compiler: - jars.append(os.path.join(lib_dir, 'scala-compiler.jar')) - - return jars - - def get_scalding_jars(self): - lib_dir = os.path.join(self.scalding_home, 'lib') - return self._get_jars(lib_dir) - - def get_scalding_core(self): - lib_dir = os.path.join(self.scalding_home, 'lib') - for j in os.listdir(lib_dir): - if j.startswith('scalding-core-'): - p = os.path.join(lib_dir, j) - logger.debug('Found scalding-core: %s', p) - return p - raise hadoop.HadoopJobError('Coudl not find scalding-core.') - - def get_provided_jars(self): - return self._get_jars(self.provided_dir) - - def get_libjars(self): - return self._get_jars(self.libjars_dir) - - def get_tmp_job_jar(self, source): - job_name = os.path.basename(os.path.splitext(source)[0]) - return os.path.join(self.tmp_dir.path, job_name + '.jar') - - def get_build_dir(self, source): - build_dir = os.path.join(self.tmp_dir.path, 'build') - return build_dir - - def get_job_class(self, source): - # find name of the job class - # usually the one that matches file name or last class that extends Job - job_name = os.path.splitext(os.path.basename(source))[0] - package = None - job_class = None - for l in open(source).readlines(): - p = re.search(r'package\s+([^\s\(]+)', l) - if p: - package = p.groups()[0] - p = re.search(r'class\s+([^\s\(]+).*extends\s+.*Job', l) - if p: - job_class = p.groups()[0] - if job_class == job_name: - break - if job_class: - if package: - job_class = package + '.' 
+ job_class - logger.debug('Found scalding job class: %s', job_class) - return job_class - else: - raise hadoop.HadoopJobError('Coudl not find scalding job class.') - - def build_job_jar(self, job): - job_jar = job.jar() - if job_jar: - if not os.path.exists(job_jar): - logger.error("Can't find jar: {0}, full path {1}".format( - job_jar, os.path.abspath(job_jar))) - raise Exception("job jar does not exist") - if not job.job_class(): - logger.error("Undefined job_class()") - raise Exception("Undefined job_class()") - return job_jar - - job_src = job.source() - if not job_src: - logger.error("Both source() and jar() undefined") - raise Exception("Both source() and jar() undefined") - if not os.path.exists(job_src): - logger.error("Can't find source: {0}, full path {1}".format( - job_src, os.path.abspath(job_src))) - raise Exception("job source does not exist") - - job_src = job.source() - job_jar = self.get_tmp_job_jar(job_src) - - build_dir = self.get_build_dir(job_src) - if not os.path.exists(build_dir): - os.makedirs(build_dir) - - classpath = ':'.join(filter(None, - self.get_scalding_jars() + - self.get_provided_jars() + - self.get_libjars() + - job.extra_jars())) - scala_cp = ':'.join(self.get_scala_jars(include_compiler=True)) - - # compile scala source - arglist = ['java', '-cp', scala_cp, 'scala.tools.nsc.Main', - '-classpath', classpath, - '-d', build_dir, job_src] - logger.info('Compiling scala source: %s', ' '.join(arglist)) - subprocess.check_call(arglist) - - # build job jar file - arglist = ['jar', 'cf', job_jar, '-C', build_dir, '.'] - logger.info('Building job jar: %s', ' '.join(arglist)) - subprocess.check_call(arglist) - return job_jar - - def run_job(self, job): - job_jar = self.build_job_jar(job) - jars = [job_jar] + self.get_libjars() + job.extra_jars() - scalding_core = self.get_scalding_core() - libjars = ','.join(filter(None, jars)) - arglist = ['hadoop', 'jar', scalding_core, '-libjars', libjars] - arglist += ['-D%s' % c for c in job.jobconfs()] - - job_class = job.job_class() or self.get_job_class(job.source()) - arglist += [job_class, '--hdfs'] - - # scalding does not parse argument with '=' properly - arglist += ['--name', job.task_id.replace('=', ':')] - - (tmp_files, job_args) = hadoop_jar.fix_paths(job) - arglist += job_args - - env = os.environ.copy() - jars.append(scalding_core) - hadoop_cp = ':'.join(filter(None, jars)) - env['HADOOP_CLASSPATH'] = hadoop_cp - logger.info("Submitting Hadoop job: HADOOP_CLASSPATH=%s %s", - hadoop_cp, ' '.join(arglist)) - hadoop.run_and_track_hadoop_job(arglist, env=env) - - for a, b in tmp_files: - a.move(b) - - -class ScaldingJobTask(hadoop.BaseHadoopJobTask): - """A job task for Scalding that define a scala source and (optional) main - method - - requires() should return a dictionary where the keys are Scalding argument - names and values are lists of paths. For example: - {'input1': ['A', 'B'], 'input2': ['C']} => --input1 A B --input2 C - """ - - def relpath(self, current_file, rel_path): - """Compute path given current file and relative path""" - script_dir = os.path.dirname(os.path.abspath(current_file)) - rel_path = os.path.abspath(os.path.join(script_dir, rel_path)) - return rel_path - - def source(self): - """Path to the scala source for this Scalding Job - Either one of source() or jar() must be specified. - """ - return None - - def jar(self): - """Path to the jar file for this Scalding Job - Either one of source() or jar() must be specified. 
- """ - return None - - def extra_jars(self): - """Extra jars for building and running this Scalding Job""" - return [] - - def job_class(self): - """optional main job class for this Scalding Job""" - return None - - def job_runner(self): - return ScaldingJobRunner() - - def atomic_output(self): - """If True, then rewrite output arguments to be temp locations and - atomically move them into place after the job finishes""" - return True - - def requires(self): - return {} - - def job_args(self): - """Extra arguments to pass to the Scalding job""" - return [] - - def args(self): - """returns an array of args to pass to the job.""" - arglist = [] - for k, v in self.requires_hadoop().iteritems(): - arglist.append('--' + k) - arglist.extend([t.output().path for t in v]) - arglist.extend(['--output', self.output()]) - arglist.extend(self.job_args()) - return arglist +# Copyright (c) 2015 Spotify AB +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. + +import warnings + +from luigi.contrib.scalding import * + +warnings.warn("luigi.scalding has now moved to luigi.contrib.scalding", DeprecationWarning, stacklevel=3) diff --git a/luigi/scheduler.py b/luigi/scheduler.py index bfefc61ded..e885509ce2 100644 --- a/luigi/scheduler.py +++ b/luigi/scheduler.py @@ -13,24 +13,27 @@ # the License. import collections +import cPickle as pickle import datetime import functools -import notifications -import os +import itertools import logging +import os import time -import cPickle as pickle + +import notifications import task_history as history -logger = logging.getLogger("luigi.server") +from task_status import DISABLED, DONE, FAILED, PENDING, RUNNING, SUSPENDED, UNKNOWN -from task_status import PENDING, FAILED, DONE, RUNNING, SUSPENDED, UNKNOWN, DISABLED +logger = logging.getLogger("luigi.server") class Scheduler(object): - ''' Abstract base class + """ + Abstract base class. Note that the methods all take string arguments, not Task objects... - ''' + """"" add_task = NotImplemented get_work = NotImplemented ping = NotImplemented @@ -47,7 +50,7 @@ class Scheduler(object): UPSTREAM_FAILED, UPSTREAM_DISABLED, ) -UPSTREAM_SEVERITY_KEY = lambda st: UPSTREAM_SEVERITY_ORDER.index(st) +UPSTREAM_SEVERITY_KEY = UPSTREAM_SEVERITY_ORDER.index STATUS_TO_UPSTREAM_MAP = { FAILED: UPSTREAM_FAILED, RUNNING: UPSTREAM_RUNNING, @@ -58,8 +61,10 @@ class Scheduler(object): # We're passing around this config a lot, so let's put it on an object SchedulerConfig = collections.namedtuple('SchedulerConfig', [ - 'retry_delay', 'remove_delay', 'worker_disconnect_delay', - 'disable_failures', 'disable_window', 'disable_persist', 'disable_time']) + 'retry_delay', 'remove_delay', 'worker_disconnect_delay', + 'disable_failures', 'disable_window', 'disable_persist', 'disable_time', + 'max_shown_tasks', +]) def fix_time(x): @@ -72,27 +77,32 @@ def fix_time(x): class Failures(object): - """ This class tracks the number of failures in a given time window + """ + This class tracks the number of failures in a given time window. 
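The ``luigi.scalding`` module body moves to ``luigi.contrib.scalding``, leaving a deprecation shim behind. Migration is just an import change:

.. code-block:: python

    # Old import; still works through the shim but emits a DeprecationWarning:
    # from luigi.scalding import ScaldingJobTask

    # New location:
    from luigi.contrib.scalding import ScaldingJobTask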
Failures added are marked with the current timestamp, and this class counts the number of failures in a sliding time window ending at the present. - """ def __init__(self, window): - """ Initialize with the given window + """ + Initialize with the given window. - :param window: how long to track failures for, as a float (number of seconds) + :param window: how long to track failures for, as a float (number of seconds). """ self.window = window self.failures = collections.deque() def add_failure(self): - """ Add a failure event with the current timestamp """ + """ + Add a failure event with the current timestamp. + """ self.failures.append(time.time()) def num_failures(self): - """ Return the number of failures in the window """ + """ + Return the number of failures in the window. + """ min_time = time.time() - self.window while self.failures and fix_time(self.failures[0]) < min_time: @@ -101,14 +111,24 @@ def num_failures(self): return len(self.failures) def clear(self): - """ Clear the failure queue """ + """ + Clear the failure queue. + """ self.failures.clear() +def _get_default(x, default): + if x is not None: + return x + else: + return default + + class Task(object): - def __init__(self, id, status, deps, resources={}, priority=0, family='', params={}, + + def __init__(self, task_id, status, deps, resources=None, priority=0, family='', params=None, disable_failures=None, disable_window=None): - self.id = id + self.id = task_id self.stakeholders = set() # workers ids that are somehow related to this task (i.e. don't prune while any of these workers are still active) self.workers = set() # workers ids that can perform task - task is 'BROKEN' if none of these workers are active if deps is None: @@ -123,9 +143,9 @@ def __init__(self, id, status, deps, resources={}, priority=0, family='', params self.time_running = None # Timestamp when picked up by worker self.expl = None self.priority = priority - self.resources = resources + self.resources = _get_default(resources, {}) self.family = family - self.params = params + self.params = _get_default(params, {}) self.disable_failures = disable_failures self.failures = Failures(disable_window) self.scheduler_disable_time = None @@ -142,83 +162,14 @@ def has_excessive_failures(self): def can_disable(self): return self.disable_failures is not None - def re_enable(self): - self.scheduler_disable_time = None - self.status = FAILED - self.failures.clear() - - def set_status(self, new_status, config): - # not sure why we have SUSPENDED, as it can never be set - if new_status == SUSPENDED: - new_status = PENDING - - if new_status == DISABLED and self.status == RUNNING: - return - - if self.status == DISABLED: - if new_status == DONE: - self.re_enable() - - # don't allow workers to override a scheduler disable - elif self.scheduler_disable_time is not None: - return - - if new_status == FAILED and self.can_disable(): - self.add_failure() - if self.has_excessive_failures(): - self.scheduler_disable_time = time.time() - new_status = DISABLED - notifications.send_error_email( - 'Luigi Scheduler: DISABLED {task} due to excessive failures'.format(task=self.id), - '{task} failed {failures} times in the last {window} seconds, so it is being ' - 'disabled for {persist} seconds'.format( - failures=config.disable_failures, - task=self.id, - window=config.disable_window, - persist=config.disable_persist, - )) - elif new_status == DISABLED: - self.scheduler_disable_time = None - - self.status = new_status - - def prune(self, config): - remove = False - - # Mark 
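The ``Failures`` sliding window drives the automatic disabling logic; a small sketch of its behaviour:

.. code-block:: python

    from luigi.scheduler import Failures

    f = Failures(window=60)        # count failures over the trailing 60 seconds
    f.add_failure()
    f.add_failure()
    assert f.num_failures() == 2   # both fall inside the window
    f.clear()
    assert f.num_failures() == 0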
tasks with no remaining active stakeholders for deletion - if not self.stakeholders: - if self.remove is None: - logger.info("Task %r has stakeholders %r but none remain connected -> will remove task in %s seconds", self.id, self.stakeholders, config.remove_delay) - self.remove = time.time() + config.remove_delay - - # If a running worker disconnects, tag all its jobs as FAILED and subject it to the same retry logic - if self.status == RUNNING and self.worker_running and self.worker_running not in self.stakeholders: - logger.info("Task %r is marked as running by disconnected worker %r -> marking as FAILED with retry delay of %rs", self.id, self.worker_running, config.retry_delay) - self.worker_running = None - self.set_status(FAILED, config) - self.retry = time.time() + config.retry_delay - - # Re-enable task after the disable time expires - if self.status == DISABLED and self.scheduler_disable_time: - if time.time() - fix_time(self.scheduler_disable_time) > config.disable_time: - self.re_enable() - - # Remove tasks that have no stakeholders - if self.remove and time.time() > self.remove: - logger.info("Removing task %r (no connected stakeholders)", self.id) - remove = True - - # Reset FAILED tasks to PENDING if max timeout is reached, and retry delay is >= 0 - if self.status == FAILED and config.retry_delay >= 0 and self.retry < time.time(): - self.set_status(PENDING, config) - - return remove - class Worker(object): - """ Structure for tracking worker activity and keeping their references """ - def __init__(self, id, last_active=None): - self.id = id + """ + Structure for tracking worker activity and keeping their references. + """ + + def __init__(self, worker_id, last_active=None): + self.id = worker_id self.reference = None # reference to the worker in the real world. (Currently a dict containing just the host) self.last_active = last_active # seconds since epoch self.started = time.time() # seconds since epoch @@ -242,16 +193,18 @@ def __str__(self): class SimpleTaskState(object): - ''' Keep track of the current state and handle persistance + """ + Keep track of the current state and handle persistance. The point of this class is to enable other ways to keep state, eg. by using a database These will be implemented by creating an abstract base class that this and other classes inherit from. - ''' + """ def __init__(self, state_path): self._state_path = state_path self._tasks = {} # map from id to a Task object + self._status_tasks = collections.defaultdict(dict) self._active_workers = {} # map from id to a Worker object def dump(self): @@ -276,40 +229,134 @@ def load(self): return self._tasks, self._active_workers = state + self._status_tasks = collections.defaultdict(dict) + for task in self._tasks.itervalues(): + self._status_tasks[task.status][task.id] = task # Convert from old format # TODO: this is really ugly, we need something more future-proof # Every time we add an attribute to the Worker class, this code needs to be updated for k, v in self._active_workers.iteritems(): if isinstance(v, float): - self._active_workers[k] = Worker(id=k, last_active=v) + self._active_workers[k] = Worker(worker_id=k, last_active=v) else: logger.info("No prior state file exists at %s. 
Starting with clean slate", self._state_path) - def get_active_tasks(self): - for task in self._tasks.itervalues(): - yield task + def get_active_tasks(self, status=None): + if status: + for task in self._status_tasks[status].itervalues(): + yield task + else: + for task in self._tasks.itervalues(): + yield task + + def get_running_tasks(self): + return self._status_tasks[RUNNING].itervalues() def get_pending_tasks(self): - for task in self._tasks.itervalues(): - if task.status in [PENDING, RUNNING]: - yield task + return itertools.chain.from_iterable(self._status_tasks[status].itervalues() + for status in [PENDING, RUNNING]) def get_task(self, task_id, default=None, setdefault=None): if setdefault: - return self._tasks.setdefault(task_id, setdefault) + task = self._tasks.setdefault(task_id, setdefault) + self._status_tasks[task.status][task.id] = task + return task else: return self._tasks.get(task_id, default) def has_task(self, task_id): return task_id in self._tasks + def re_enable(self, task, config=None): + task.scheduler_disable_time = None + task.failures.clear() + if config: + self.set_status(task, FAILED, config) + task.failures.clear() + + def set_status(self, task, new_status, config=None): + if new_status == FAILED: + assert config is not None + + # not sure why we have SUSPENDED, as it can never be set + if new_status == SUSPENDED: + new_status = PENDING + + if new_status == DISABLED and task.status == RUNNING: + return + + if task.status == DISABLED: + if new_status == DONE: + self.re_enable(task) + + # don't allow workers to override a scheduler disable + elif task.scheduler_disable_time is not None: + return + + if new_status == FAILED and task.can_disable(): + task.add_failure() + if task.has_excessive_failures(): + task.scheduler_disable_time = time.time() + new_status = DISABLED + notifications.send_error_email( + 'Luigi Scheduler: DISABLED {task} due to excessive failures'.format(task=task.id), + '{task} failed {failures} times in the last {window} seconds, so it is being ' + 'disabled for {persist} seconds'.format( + failures=config.disable_failures, + task=task.id, + window=config.disable_window, + persist=config.disable_persist, + )) + elif new_status == DISABLED: + task.scheduler_disable_time = None + + self._status_tasks[task.status].pop(task.id) + self._status_tasks[new_status][task.id] = task + task.status = new_status + + def prune(self, task, config): + remove = False + + # Mark tasks with no remaining active stakeholders for deletion + if not task.stakeholders: + if task.remove is None: + logger.info("Task %r has stakeholders %r but none remain connected -> will remove " + "task in %s seconds", task.id, task.stakeholders, config.remove_delay) + task.remove = time.time() + config.remove_delay + + # If a running worker disconnects, tag all its jobs as FAILED and subject it to the same retry logic + if task.status == RUNNING and task.worker_running and task.worker_running not in task.stakeholders: + logger.info("Task %r is marked as running by disconnected worker %r -> marking as " + "FAILED with retry delay of %rs", task.id, task.worker_running, + config.retry_delay) + task.worker_running = None + self.set_status(task, FAILED, config) + task.retry = time.time() + config.retry_delay + + # Re-enable task after the disable time expires + if task.status == DISABLED and task.scheduler_disable_time: + if time.time() - fix_time(task.scheduler_disable_time) > config.disable_time: + self.re_enable(task, config) + + # Remove tasks that have no stakeholders + if 
task.remove and time.time() > task.remove: + logger.info("Removing task %r (no connected stakeholders)", task.id) + remove = True + + # Reset FAILED tasks to PENDING if max timeout is reached, and retry delay is >= 0 + if task.status == FAILED and config.retry_delay >= 0 and task.retry < time.time(): + self.set_status(task, PENDING, config) + + return remove + def inactivate_tasks(self, delete_tasks): # The terminology is a bit confusing: we used to "delete" tasks when they became inactive, # but with a pluggable state storage, you might very well want to keep some history of # older tasks as well. That's why we call it "inactivate" (as in the verb) for task in delete_tasks: - self._tasks.pop(task) + task_obj = self._tasks.pop(task) + self._status_tasks[task_obj.status].pop(task) def get_active_workers(self, last_active_lt=None): for worker in self._active_workers.itervalues(): @@ -318,7 +365,7 @@ def get_active_workers(self, last_active_lt=None): yield worker def get_worker_ids(self): - return self._active_workers.keys() # only used for unit tests + return self._active_workers.keys() # only used for unit tests def get_worker(self, worker_id): return self._active_workers.setdefault(worker_id, Worker(worker_id)) @@ -335,22 +382,24 @@ def inactivate_workers(self, delete_workers): class CentralPlannerScheduler(Scheduler): - ''' Async scheduler that can handle multiple workers etc + """ + Async scheduler that can handle multiple workers, etc. Can be run locally or on a server (using RemoteScheduler + server.Server). - ''' + """ def __init__(self, retry_delay=900.0, remove_delay=600.0, worker_disconnect_delay=60.0, state_path='/var/lib/luigi-server/state.pickle', task_history=None, - resources=None, disable_persist=0, disable_window=0, disable_failures=None): - ''' + resources=None, disable_persist=0, disable_window=0, disable_failures=None, + max_shown_tasks=100000): + """ (all arguments are in seconds) Keyword Arguments: - retry_delay -- How long after a Task fails to try it again, or -1 to never retry - remove_delay -- How long after a Task finishes to remove it from the scheduler - state_path -- Path to state file (tasks and active workers) - worker_disconnect_delay -- If a worker hasn't communicated for this long, remove it from active workers - ''' + :param retry_delay: how long after a Task fails to try it again, or -1 to never retry. + :param remove_delay: how long after a Task finishes to remove it from the scheduler. + :param state_path: path to state file (tasks and active workers). + :param worker_disconnect_delay: if a worker hasn't communicated for this long, remove it from active workers. 
+ """ self._config = SchedulerConfig( retry_delay=retry_delay, remove_delay=remove_delay, @@ -358,11 +407,11 @@ def __init__(self, retry_delay=900.0, remove_delay=600.0, worker_disconnect_dela disable_failures=disable_failures, disable_window=disable_window, disable_persist=disable_persist, - disable_time=disable_persist) + disable_time=disable_persist, + max_shown_tasks=max_shown_tasks, + ) - self._task_history = task_history or history.NopHistory() self._state = SimpleTaskState(state_path) - self._task_history = task_history or history.NopHistory() self._resources = resources self._make_task = functools.partial( @@ -387,7 +436,7 @@ def prune(self): remove_tasks = [] for task in self._state.get_active_tasks(): - if task.prune(self._config): + if self._state.prune(task, self._config): remove_tasks.append(task.id) self._state.inactivate_tasks(remove_tasks) @@ -395,15 +444,18 @@ def prune(self): logger.info("Done pruning task graph") def update(self, worker_id, worker_reference=None): - """ Keep track of whenever the worker was last active """ + """ + Keep track of whenever the worker was last active. + """ worker = self._state.get_worker(worker_id) worker.update(worker_reference) def _update_priority(self, task, prio, worker): - """ Update priority of the given task + """ + Update priority of the given task. - Priority can only be increased. If the task doesn't exist, a placeholder - task is created to preserve priority when the task is later scheduled. + Priority can only be increased. + If the task doesn't exist, a placeholder task is created to preserve priority when the task is later scheduled. """ task.priority = prio = max(prio, task.priority) for dep in task.deps or []: @@ -413,25 +465,25 @@ def _update_priority(self, task, prio, worker): def add_task(self, worker, task_id, status=PENDING, runnable=True, deps=None, new_deps=None, expl=None, resources=None, - priority=0, family='', params={}): + priority=0, family='', params=None, **kwargs): """ - * Add task identified by task_id if it doesn't exist - * If deps is not None, update dependency list - * Update status of task - * Add additional workers/stakeholders - * Update priority when needed + * add task identified by task_id if it doesn't exist + * if deps is not None, update dependency list + * update status of task + * add additional workers/stakeholders + * update priority when needed """ self.update(worker) task = self._state.get_task(task_id, setdefault=self._make_task( - id=task_id, status=PENDING, deps=deps, resources=resources, - priority=priority, family=family, params=params)) + task_id=task_id, status=PENDING, deps=deps, resources=resources, + priority=priority, family=family, params=params)) # for setting priority, we'll sometimes create tasks with unset family and params if not task.family: task.family = family if not task.params: - task.params = params + task.params = _get_default(params, {}) if task.remove is not None: task.remove = None # unmark task for removal so it isn't removed after being added @@ -443,7 +495,7 @@ def add_task(self, worker, task_id, status=PENDING, runnable=True, # We also check for status == PENDING b/c that's the default value # (so checking for status != task.status woule lie) self._update_task_history(task_id, status) - task.set_status(PENDING if status == SUSPENDED else status, self._config) + self._state.set_status(task, PENDING if status == SUSPENDED else status, self._config) if status == FAILED: task.retry = time.time() + self._config.retry_delay @@ -459,7 +511,7 @@ def add_task(self, 
worker, task_id, status=PENDING, runnable=True, # Task dependencies might not exist yet. Let's create dummy tasks for them for now. # Otherwise the task dependencies might end up being pruned if scheduling takes a long time for dep in task.deps or []: - t = self._state.get_task(dep, setdefault=self._make_task(id=dep, status=UNKNOWN, deps=None, priority=priority)) + t = self._state.get_task(dep, setdefault=self._make_task(task_id=dep, status=UNKNOWN, deps=None, priority=priority)) t.stakeholders.add(worker) self._update_priority(task, priority, worker) @@ -470,7 +522,7 @@ def add_task(self, worker, task_id, status=PENDING, runnable=True, if expl is not None: task.expl = expl - def add_worker(self, worker, info): + def add_worker(self, worker, info, **kwargs): self._state.get_worker(worker).add_info(info) def update_resources(self, **resources): @@ -483,7 +535,7 @@ def _has_resources(self, needed_resources, used_resources): return True available_resources = self._resources or {} - for resource, amount in needed_resources.items(): + for resource, amount in needed_resources.iteritems(): if amount + used_resources[resource] > available_resources.get(resource, 1): return False return True @@ -493,17 +545,22 @@ def _used_resources(self): if self._resources is not None: for task in self._state.get_active_tasks(): if task.status == RUNNING and task.resources: - for resource, amount in task.resources.items(): + for resource, amount in task.resources.iteritems(): used_resources[resource] += amount return used_resources def _rank(self): - ''' Return worker's rank function for task scheduling ''' + """ + Return worker's rank function for task scheduling. + + :return: + """ dependents = collections.defaultdict(int) + def not_done(t): task = self._state.get_task(t, default=None) return task is None or task.status != DONE - for task in self._state.get_active_tasks(): + for task in self._state.get_pending_tasks(): if task.status != DONE: deps = filter(not_done, task.deps) inverse_num_deps = 1.0 / max(len(deps), 1) @@ -521,7 +578,7 @@ def _schedulable(self, task): return False return True - def get_work(self, worker, host=None): + def get_work(self, worker, host=None, **kwargs): # TODO: remove any expired nodes # Algo: iterate over all nodes, find the highest priority node no dependencies and available @@ -568,7 +625,7 @@ def get_work(self, worker, host=None): if task.status == RUNNING and task.worker_running in greedy_workers: greedy_workers[task.worker_running] -= 1 - for resource, amount in (task.resources or {}).items(): + for resource, amount in (task.resources or {}).iteritems(): greedy_resources[resource] += amount if not best_task and self._schedulable(task) and self._has_resources(task.resources, greedy_resources): @@ -582,13 +639,13 @@ def get_work(self, worker, host=None): greedy_workers[task_worker] -= 1 # keep track of the resources used in greedy scheduling - for resource, amount in (task.resources or {}).items(): + for resource, amount in (task.resources or {}).iteritems(): greedy_resources[resource] += amount break if best_task: - best_task.status = RUNNING + self._state.set_status(best_task, RUNNING, self._config) best_task.worker_running = worker best_task.time_running = time.time() self._update_task_history(best_task.id, RUNNING, host=host) @@ -598,7 +655,7 @@ def get_work(self, worker, host=None): 'task_id': best_task_id, 'running_tasks': running_tasks} - def ping(self, worker): + def ping(self, worker, **kwargs): self.update(worker) def _upstream_status(self, task_id, 
upstream_status_table): @@ -621,7 +678,7 @@ def _upstream_status(self, task_id, upstream_status_table): elif upstream_status_table[dep_id] == '' and dep.deps: # This is the postorder update step when we set the # status based on the previously calculated child elements - upstream_status = [upstream_status_table.get(id, '') for id in dep.deps] + upstream_status = [upstream_status_table.get(task_id, '') for task_id in dep.deps] upstream_status.append('') # to handle empty list status = max(upstream_status, key=UPSTREAM_SEVERITY_KEY) upstream_status_table[dep_id] = status @@ -645,7 +702,7 @@ def _serialize_task(self, task_id, include_deps=True): ret['deps'] = list(task.deps) return ret - def graph(self): + def graph(self, **kwargs): self.prune() serialized = {} for task in self._state.get_active_tasks(): @@ -678,27 +735,30 @@ def _recurse_deps(self, task_id, serialized): for dep in task.deps: self._recurse_deps(dep, serialized) - def dep_graph(self, task_id): + def dep_graph(self, task_id, **kwargs): self.prune() serialized = {} if self._state.has_task(task_id): self._recurse_deps(task_id, serialized) return serialized - def task_list(self, status, upstream_status): - ''' query for a subset of tasks by status ''' + def task_list(self, status, upstream_status, limit=True, **kwargs): + """ + Query for a subset of tasks by status. + """ self.prune() result = {} upstream_status_table = {} # used to memoize upstream status - for task in self._state.get_active_tasks(): - if not status or task.status == status: - if (task.status != PENDING or not upstream_status or + for task in self._state.get_active_tasks(status): + if (task.status != PENDING or not upstream_status or upstream_status == self._upstream_status(task.id, upstream_status_table)): - serialized = self._serialize_task(task.id, False) - result[task.id] = serialized + serialized = self._serialize_task(task.id, False) + result[task.id] = serialized + if limit and len(result) > self._config.max_shown_tasks: + return {'num_tasks': len(result)} return result - def worker_list(self, include_running=True): + def worker_list(self, include_running=True, **kwargs): self.prune() workers = [ dict( @@ -728,7 +788,7 @@ def worker_list(self, include_running=True): worker['running'] = tasks return workers - def inverse_dependencies(self, task_id): + def inverse_dependencies(self, task_id, **kwargs): self.prune() serialized = {} if self._state.has_task(task_id): @@ -748,8 +808,13 @@ def _traverse_inverse_deps(self, task_id, serialized): serialized[task.id]["deps"] = [] stack.append(task.id) - def task_search(self, task_str): - ''' query for a subset of tasks by task_id ''' + def task_search(self, task_str, **kwargs): + """ + Query for a subset of tasks by task_id. + + :param task_str: + :return: + """ self.prune() result = collections.defaultdict(dict) for task in self._state.get_active_tasks(): @@ -762,11 +827,11 @@ def re_enable_task(self, task_id): serialized = {} task = self._state.get_task(task_id) if task and task.status == DISABLED and task.scheduler_disable_time: - task.re_enable() + self._state.re_enable(task, self._config) serialized = self._serialize_task(task_id) return serialized - def fetch_error(self, task_id): + def fetch_error(self, task_id, **kwargs): if self._state.has_task(task_id): return {"taskId": task_id, "error": self._state.get_task(task_id).expl} else: diff --git a/luigi/server.py b/luigi/server.py index cddbfa71cd..430737deda 100644 --- a/luigi/server.py +++ b/luigi/server.py @@ -13,23 +13,25 @@ # the License. 
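Editor's note on the scheduler changes above: SimpleTaskState now keeps a secondary index, _status_tasks, mapping each status to the tasks currently in it, which is what lets get_active_tasks(status), get_pending_tasks() and task_list() avoid scanning every task. The standalone sketch below is illustrative only; StatusIndex, _Task and by_status are made-up names modelling that bookkeeping, not code from the patch.

import collections


class _Task(object):
    def __init__(self, task_id, status):
        self.id, self.status = task_id, status


class StatusIndex(object):
    """Toy model of the _status_tasks bookkeeping added to SimpleTaskState."""

    def __init__(self):
        self._tasks = {}                                    # task_id -> task
        self._status_tasks = collections.defaultdict(dict)  # status -> {task_id: task}

    def add(self, task):
        self._tasks[task.id] = task
        self._status_tasks[task.status][task.id] = task

    def set_status(self, task, new_status):
        # both structures must be updated together, as in SimpleTaskState.set_status()
        self._status_tasks[task.status].pop(task.id)
        self._status_tasks[new_status][task.id] = task
        task.status = new_status

    def by_status(self, status):
        # O(tasks in that status) instead of a scan over self._tasks
        return self._status_tasks[status].values()


index = StatusIndex()
t = _Task('MyTask(date=2015-01-01)', 'PENDING')
index.add(t)
index.set_status(t, 'RUNNING')
assert [task.id for task in index.by_status('RUNNING')] == [t.id]

Relatedly, task_list() now caps its response: when more tasks match than the new max_shown_tasks setting, it returns only {'num_tasks': N} so the visualiser is not handed an unbounded payload.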
# Simple REST server that takes commands in a JSON payload -import json -import os import atexit +import json +import logging import mimetypes +import os import posixpath +import signal + +import pkg_resources +import tornado.httpclient +import tornado.httpserver import tornado.ioloop import tornado.netutil import tornado.web -import tornado.httpclient -import tornado.httpserver + import configuration import scheduler -import pkg_resources -import signal -from rpc import RemoteSchedulerResponder import task_history -import logging + logger = logging.getLogger("luigi.server") @@ -45,6 +47,7 @@ def _create_scheduler(): disable_window = config.getint('scheduler', 'disable-window-seconds', 3600) disable_failures = config.getint('scheduler', 'disable-num-failures', None) disable_persist = config.getint('scheduler', 'disable-persist-seconds', 86400) + max_shown_tasks = config.getint('scheduler', 'max-shown-tasks', 100000) resources = config.getintdict('resources') if config.getboolean('scheduler', 'record_task_history', False): @@ -54,21 +57,28 @@ def _create_scheduler(): task_history_impl = task_history.NopHistory() return scheduler.CentralPlannerScheduler( retry_delay, remove_delay, worker_disconnect_delay, state_path, task_history_impl, - resources, disable_persist, disable_window, disable_failures) + resources, disable_persist, disable_window, disable_failures, max_shown_tasks, + ) class RPCHandler(tornado.web.RequestHandler): - """ Handle remote scheduling calls using rpc.RemoteSchedulerResponder""" + """ + Handle remote scheduling calls using rpc.RemoteSchedulerResponder. + """ - def initialize(self, api): - self._api = api + def initialize(self, scheduler): + self._scheduler = scheduler def get(self, method): payload = self.get_argument('data', default="{}") arguments = json.loads(payload) - if hasattr(self._api, method): - result = getattr(self._api, method)(**arguments) + # TODO: we should probably denote all methods on the scheduler that are "API-level" + # versus internal methods. Right now you can do a REST method call to any method + # defined on the scheduler, which is pretty bad from a security point of view. 
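The TODO in RPCHandler.get above notes that the handler will dispatch any method name it receives straight onto the scheduler object. Purely as an illustration of the allow-list approach that comment hints at (API_METHODS and RestrictedRPCHandler are hypothetical names, not part of this patch), the dispatch could be limited to known RPC endpoints:

import json

import tornado.web

# Names taken from the scheduler methods exposed in this patch; the set itself is illustrative.
API_METHODS = frozenset([
    'add_task', 'add_worker', 'get_work', 'ping', 'graph', 'dep_graph',
    'task_list', 'worker_list', 'inverse_dependencies', 'task_search',
    're_enable_task', 'fetch_error', 'update_resources',
])


class RestrictedRPCHandler(tornado.web.RequestHandler):

    def initialize(self, scheduler):
        self._scheduler = scheduler

    def get(self, method):
        payload = self.get_argument('data', default="{}")
        arguments = json.loads(payload)
        if method in API_METHODS and hasattr(self._scheduler, method):
            result = getattr(self._scheduler, method)(**arguments)
            self.write({"response": result})  # same response envelope as RPCHandler
        else:
            self.send_error(404)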
+ + if hasattr(self._scheduler, method): + result = getattr(self._scheduler, method)(**arguments) self.write({"response": result}) # wrap all json response in a dictionary else: self.send_error(404) @@ -77,40 +87,46 @@ def get(self, method): class BaseTaskHistoryHandler(tornado.web.RequestHandler): - def initialize(self, api): - self._api = api + + def initialize(self, scheduler): + self._scheduler = scheduler def get_template_path(self): return pkg_resources.resource_filename(__name__, 'templates') class RecentRunHandler(BaseTaskHistoryHandler): + def get(self): - tasks = self._api.task_history.find_latest_runs() + tasks = self._scheduler.task_history.find_latest_runs() self.render("recent.html", tasks=tasks) class ByNameHandler(BaseTaskHistoryHandler): + def get(self, name): - tasks = self._api.task_history.find_all_by_name(name) + tasks = self._scheduler.task_history.find_all_by_name(name) self.render("recent.html", tasks=tasks) class ByIdHandler(BaseTaskHistoryHandler): + def get(self, id): - task = self._api.task_history.find_task_by_id(id) + task = self._scheduler.task_history.find_task_by_id(id) self.render("show.html", task=task) class ByParamsHandler(BaseTaskHistoryHandler): + def get(self, name): payload = self.get_argument('data', default="{}") arguments = json.loads(payload) - tasks = self._api.task_history.find_all_by_parameters(name, session=None, **arguments) + tasks = self._scheduler.task_history.find_all_by_parameters(name, session=None, **arguments) self.render("recent.html", tasks=tasks) class StaticFileHandler(tornado.web.RequestHandler): + def get(self, path): # Path checking taken from Flask's safe_join function: # https://github.com/mitsuhiko/flask/blob/1d55b8983/flask/helpers.py#L563-L587 @@ -126,27 +142,29 @@ def get(self, path): class RootPathHandler(tornado.web.RequestHandler): + def get(self): self.redirect("/static/visualiser/index.html") -def app(api): +def app(scheduler): handlers = [ - (r'/api/(.*)', RPCHandler, {"api": api}), + (r'/api/(.*)', RPCHandler, {"scheduler": scheduler}), (r'/static/(.*)', StaticFileHandler), (r'/', RootPathHandler), - (r'/history', RecentRunHandler, {'api': api}), - (r'/history/by_name/(.*?)', ByNameHandler, {'api': api}), - (r'/history/by_id/(.*?)', ByIdHandler, {'api': api}), - (r'/history/by_params/(.*?)', ByParamsHandler, {'api': api}) + (r'/history', RecentRunHandler, {'scheduler': scheduler}), + (r'/history/by_name/(.*?)', ByNameHandler, {'scheduler': scheduler}), + (r'/history/by_id/(.*?)', ByIdHandler, {'scheduler': scheduler}), + (r'/history/by_params/(.*?)', ByParamsHandler, {'scheduler': scheduler}) ] api_app = tornado.web.Application(handlers) return api_app -def _init_api(sched, responder, api_port, address): - api = responder or RemoteSchedulerResponder(sched) - api_app = app(api) +def _init_api(sched, responder=None, api_port=None, address=None): + if responder: + raise Exception('The "responder" argument is no longer supported') + api_app = app(sched) api_sockets = tornado.netutil.bind_sockets(api_port, address=address) server = tornado.httpserver.HTTPServer(api_app) server.add_sockets(api_sockets) @@ -156,7 +174,9 @@ def _init_api(sched, responder, api_port, address): def run(api_port=8082, address=None, scheduler=None, responder=None): - """ Runs one instance of the API server """ + """ + Runs one instance of the API server. 
+ """ sched = scheduler or _create_scheduler() # load scheduler state sched.load() @@ -175,9 +195,9 @@ def shutdown_handler(foo=None, bar=None): signal.signal(signal.SIGINT, shutdown_handler) signal.signal(signal.SIGTERM, shutdown_handler) if os.name == 'nt': - signal.signal(signal.SIGBREAK, shutdown_handler) + signal.signal(signal.SIGBREAK, shutdown_handler) else: - signal.signal(signal.SIGQUIT, shutdown_handler) + signal.signal(signal.SIGQUIT, shutdown_handler) atexit.register(shutdown_handler) logger.info("Scheduler starting up") @@ -186,10 +206,17 @@ def shutdown_handler(foo=None, bar=None): def run_api_threaded(api_port=8082, address=None): - ''' For integration tests''' + """ + For integration tests. + + :param api_port: + :param address: + :return: + """ sock_names = _init_api(_create_scheduler(), None, api_port, address) import threading + def scheduler_thread(): # this is wrapped in a function so we get the instance # from the scheduler thread and not from the main thread diff --git a/luigi/static/visualiser/index.html b/luigi/static/visualiser/index.html index f479865c6a..151ac630b5 100644 --- a/luigi/static/visualiser/index.html +++ b/luigi/static/visualiser/index.html @@ -82,6 +82,9 @@
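One editorial aside on a pattern used throughout the scheduler changes earlier in this patch: Task.__init__ and add_task switch from mutable default arguments (resources={}, params={}) to None plus the _get_default helper. The self-contained snippet below is illustrative only (risky and safe are made-up names); it shows the pitfall that change avoids, namely that a mutable default is created once at definition time and shared across calls.

def risky(resources={}):
    # the default dict is created once and reused for every call
    resources['used'] = resources.get('used', 0) + 1
    return resources


def safe(resources=None):
    # mirrors the _get_default(resources, {}) pattern from the patch
    resources = resources if resources is not None else {}
    resources['used'] = resources.get('used', 0) + 1
    return resources


first, second = risky(), risky()
assert first is second and first == {'used': 2}   # state leaked between calls

a, b = safe(), safe()
assert a is not b and a == b == {'used': 1}       # each call gets a fresh dict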

{{/tasks}} +