Skip to content

Commit

Permalink
[ADAM-1847] Update ADAM scripts to support self-contained pip install.
Browse files Browse the repository at this point in the history
Resolves bigdatagenomics#1847. Cribs heavily from PySpark's script flow for supporting a full,
self-contained pip install-able Spark by finding the JARs and bin scripts and
packaging them up as packages which are deployed to pip. We then needed to
modify the bin scripts to find the pip installed JARs.
  • Loading branch information
fnothaft committed Dec 30, 2017
1 parent e1ad0e3 commit 90a8b44
Show file tree
Hide file tree
Showing 6 changed files with 221 additions and 51 deletions.
77 changes: 77 additions & 0 deletions adam-python/bdgenomics/adam/find_adam_home.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#!/usr/bin/env python

#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script is copied from Apache Spark, with minor modifications.
#
# This script attempts to determine the correct setting for ADAM_HOME given
# that ADAM may have been installed on the system with pip.

from __future__ import print_function
import os
import sys


def _find_adam_home():
    """Find the ADAM_HOME.

    If the ``ADAM_HOME`` environment variable is set, its value is returned
    as-is. Otherwise a list of candidate directories is probed in order and
    the first one that looks like an ADAM installation is returned.

    Returns:
        str: the value of ``ADAM_HOME`` from the environment, or the absolute
        path of the first candidate directory accepted by ``is_adam_home``.

    Raises:
        SystemExit: with status -1, after printing the searched paths to
        stderr, when no candidate directory qualifies.
    """
    # If the environment has ADAM_HOME set trust it.
    if "ADAM_HOME" in os.environ:
        return os.environ["ADAM_HOME"]

    def is_adam_home(path):
        """Return True if the provided path could be a reasonable ADAM_HOME."""
        return (os.path.isfile(os.path.join(path, "bin/adam-submit")) and
                (os.path.isdir(os.path.join(path, "jars")) or
                 os.path.isdir(os.path.join(path, "assembly"))))

    paths = ["../", os.path.dirname(os.path.realpath(__file__))]

    # Add the path of the ADAM module if it exists
    module_home = None
    if sys.version_info[0] < 3:
        import imp
        try:
            # imp.find_module does not understand dotted names, so resolve
            # the parent package first and then look for "adam" inside it.
            package_home = imp.find_module("bdgenomics")[1]
            module_home = imp.find_module("adam", [package_home])[1]
        except ImportError:
            # Not pip installed no worries
            pass
    else:
        from importlib.util import find_spec
        try:
            spec = find_spec("bdgenomics.adam")
        except (ImportError, ValueError):
            # Not pip installed no worries
            spec = None
        # find_spec returns None when the parent package exists but the
        # submodule does not, and a spec may carry no origin at all.
        if spec is not None and spec.origin is not None:
            module_home = os.path.dirname(spec.origin)

    if module_home is not None:
        paths.append(module_home)
        # If we are installed in edit mode also look two dirs up
        paths.append(os.path.join(module_home, "../../"))

    # Normalize the paths
    paths = [os.path.abspath(p) for p in paths]

    for path in paths:
        if is_adam_home(path):
            return path

    print("Could not find valid ADAM_HOME while searching {0}".format(paths), file=sys.stderr)
    sys.exit(-1)

if __name__ == "__main__":
    # Run as a script: write the resolved ADAM_HOME to stdout.
    adam_home = _find_adam_home()
    print(adam_home)
102 changes: 93 additions & 9 deletions adam-python/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,99 @@
# limitations under the License.
#

from __future__ import print_function

import glob
import os
import sys

from setuptools import find_packages, setup

from version import version as adam_version

# Distribution metadata, collected first and then handed to setuptools.
# NOTE(review): this view also contains a second, longer setup() call further
# down; this one looks like the pre-change version - confirm which is meant
# to remain.
_package_metadata = dict(
    name='bdgenomics.adam',
    version=adam_version,
    description='A fast, scalable genome analysis system',
    author='Frank Austin Nothaft',
    author_email='fnothaft@berkeley.edu',
    url="https://github.com/bdgenomics/adam",
    install_requires=[],
    packages=find_packages(exclude=['*.test.*']),
)
setup(**_package_metadata)
# Refuse to run on interpreters older than the minimum supported version.
if sys.version_info < (2, 7):
    print("Python versions prior to 2.7 are not supported for pip installed PyADAM.",
          file=sys.stderr)
    sys.exit(-1)

# Provide guidance about how to use setup.py
incorrect_invocation_message = """
If you are installing PyADAM from ADAM's source, you must first build ADAM and
run sdist.
  To build ADAM with maven you can run:
    mvn -DskipTests clean package
  Building the source dist is done in the Python directory:
    cd adam-python
    python setup.py sdist
    pip install dist/*.tar.gz"""

# Figure out where the jars are we need to package
ADAM_HOME = os.path.abspath("../")
TEMP_PATH = "deps"

JARS_TARGET = os.path.join(TEMP_PATH, "jars")

SCRIPTS_PATH = os.path.join(ADAM_HOME, "bin")
SCRIPTS_TARGET = os.path.join(TEMP_PATH, "bin")

# Exactly one Scala build of the assembly must be present.
_jar_dirs = glob.glob(os.path.join(ADAM_HOME, "adam-assembly/target/scala-*/jars/"))
if len(_jar_dirs) > 1:
    print("Assembly jars exist for multiple scalas ({0}), please cleanup "
          "adam-assembly/target".format(_jar_dirs), file=sys.stderr)
    sys.exit(-1)
if not _jar_dirs:
    # NOTE(review): this also fires when setup.py runs from an unpacked sdist,
    # where ../adam-assembly is absent - confirm how sdist installs should
    # locate the jars.
    print(incorrect_invocation_message, file=sys.stderr)
    sys.exit(-1)
JARS_PATH = _jar_dirs[0]

# The staging directory must not pre-exist: it is removed again after setup()
# and we do not want to delete something we did not create. Any other mkdir
# failure (e.g. permissions) propagates with its real error.
if os.path.exists(TEMP_PATH):
    print("Temp path for symlink to parent already exists {0}".format(TEMP_PATH),
          file=sys.stderr)
    sys.exit(-1)
os.mkdir(TEMP_PATH)

try:
    # Stage the jars and launcher scripts under TEMP_PATH via symlinks so that
    # setuptools can see them from inside the Python source tree.
    os.symlink(JARS_PATH, JARS_TARGET)
    os.symlink(SCRIPTS_PATH, SCRIPTS_TARGET)

    packages = find_packages(exclude=['*.test.*'])
    packages.append('bdgenomics.adam.jars')

    # Scripts directive requires a list of each script path and does not take wild cards.
    scripts = [os.path.join(SCRIPTS_TARGET, script)
               for script in os.listdir(SCRIPTS_TARGET)]

    # We add find_adam_home.py to the bin directory we install so that pip installed PyADAM
    # will search for ADAM_HOME with Python.
    scripts.append("bdgenomics/adam/find_adam_home.py")

    setup(
        name='bdgenomics.adam',
        version=adam_version,
        description='A fast, scalable genome analysis system',
        author='Frank Austin Nothaft',
        author_email='fnothaft@berkeley.edu',
        url="https://github.com/bdgenomics/adam",
        scripts=scripts,
        packages=packages,
        include_package_data=True,
        install_requires=['pyspark>=1.6.0'],
        package_dir={'bdgenomics.adam.jars': 'deps/jars'},
        # package_data maps a package to a list of glob patterns relative to
        # that package's directory.
        package_data={'bdgenomics.adam.jars': ['*.jar']},
        classifiers=[
            'Development Status :: 4 - Beta',
            'License :: OSI Approved :: Apache Software License',
            'Programming Language :: Python :: 2.7',
            'Programming Language :: Python :: 3',
            'Programming Language :: Python :: 3.4',
            'Programming Language :: Python :: 3.5',
            'Programming Language :: Python :: 3.6',
            'Programming Language :: Python :: Implementation :: CPython',
            'Programming Language :: Python :: Implementation :: PyPy',
            'Topic :: Scientific/Engineering :: Bio-Informatics'])

finally:
    # Only remove the links that were actually created, so a failure while
    # staging is not masked by a second error during cleanup.
    for _link in (JARS_TARGET, SCRIPTS_TARGET):
        if os.path.islink(_link):
            os.remove(_link)
    os.rmdir(TEMP_PATH)
9 changes: 4 additions & 5 deletions bin/find-adam-assembly.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,14 @@
set -e

SOURCE_DIR=$(dirname ${BASH_SOURCE[0]})
SCRIPT_DIR=$(${SOURCE_DIR}/find-script-dir.sh)
INSTALL_DIR=$(dirname $SCRIPT_DIR)
. ${SOURCE_DIR}/find-adam-home

# Find ADAM cli assembly jar
ADAM_CLI_JAR=
if [ -d "$INSTALL_DIR/repo" ]; then
ASSEMBLY_DIR="$INSTALL_DIR/repo"
if [ -d "$ADAM_HOME/repo" ]; then
ASSEMBLY_DIR="$ADAM_HOME/repo"
else
ASSEMBLY_DIR="$INSTALL_DIR/adam-assembly/target"
ASSEMBLY_DIR="$ADAM_HOME/adam-assembly/target"
fi

ASSEMBLY_JARS=$(ls -1 "$ASSEMBLY_DIR" | grep "^adam[0-9A-Za-z\.\_\-]*\.jar$" | grep -v javadoc | grep -v sources || true)
Expand Down
9 changes: 4 additions & 5 deletions bin/find-adam-egg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,13 @@
set -e

SOURCE_DIR=$(dirname ${BASH_SOURCE[0]})
SCRIPT_DIR=$(${SOURCE_DIR}/find-script-dir.sh)
INSTALL_DIR=$(dirname $SCRIPT_DIR)
. ${SOURCE_DIR}/find-adam-home

# Find ADAM python egg
if [ -d "$INSTALL_DIR/repo" ]; then
DIST_DIR="$INSTALL_DIR/repo"
if [ -d "$ADAM_HOME/repo" ]; then
DIST_DIR="$ADAM_HOME/repo"
else
DIST_DIR="$INSTALL_DIR/adam-python/dist"
DIST_DIR="$ADAM_HOME/adam-python/dist"
fi

DIST_EGG=$(ls -1 "$DIST_DIR" | grep "^bdgenomics\.adam[0-9A-Za-z\.\_\-]*.egg$" || true)
Expand Down
43 changes: 43 additions & 0 deletions bin/find-adam-home
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/usr/bin/env bash
#
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

set -e

# Attempts to find a proper value for ADAM_HOME. Should be included using "source" directive.
#
# On return ADAM_HOME is exported. If ADAM_HOME was already set it is left
# untouched.

# Locate this file's directory. BASH_SOURCE is used instead of $0 because $0
# names the calling script when this file is sourced.
FIND_ADAM_HOME_PYTHON_SCRIPT="$(cd "$(dirname "${BASH_SOURCE[0]}")"; pwd)/find_adam_home.py"

# Short circuit if the user already has this set. This is a guard, not an
# "exit": exiting here would terminate the script that sourced this file.
if [ -z "${ADAM_HOME}" ]; then
  if [ ! -f "$FIND_ADAM_HOME_PYTHON_SCRIPT" ]; then
    # If we are not in the same directory as find_adam_home.py we are not pip installed so we don't
    # need to search the different Python directories for an ADAM installation.
    # Note also that, if the user has pip installed adam but is directly calling pyadam or
    # adam-submit in another directory we want to use that version of adam rather than the
    # pip installed version of adam.
    export ADAM_HOME="$(cd "$(dirname "${BASH_SOURCE[0]}")"/..; pwd)"
  else
    # We are pip installed, use the Python script to resolve a reasonable ADAM_HOME
    # Default to standard python interpreter unless told otherwise
    if [[ -z "$PYSPARK_DRIVER_PYTHON" ]]; then
      PYSPARK_DRIVER_PYTHON="${PYSPARK_PYTHON:-"python"}"
    fi
    # Assign first and export afterwards: "export VAR=$(cmd)" reports the exit
    # status of export, hiding a failure of the Python script from "set -e".
    ADAM_HOME="$($PYSPARK_DRIVER_PYTHON "$FIND_ADAM_HOME_PYTHON_SCRIPT")"
    export ADAM_HOME
  fi
fi
32 changes: 0 additions & 32 deletions bin/find-script-dir.sh

This file was deleted.

0 comments on commit 90a8b44

Please sign in to comment.