
Commit

[Criteo] Update build with Criteo specificities (#131)
* [CRITEO] Update build
  * Update the Docker image to match the one used for the 2.x builds
  * Add build scripts to be used with the Jenkins build
  * Downgrade maven-deploy-plugin to version 2.8.1 (same as branch 2.x); otherwise uploading to Nexus fails with HTTP 401 (see the sketch after this list)
  * Add a Python requirements.txt
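
One way to exercise the pinned plugin from the command line is to call the goal fully qualified, so Maven cannot silently pick a newer plugin version. A minimal sketch, reusing the script's variables (the coordinates below are illustrative, not an exact command from this build):

    mvn org.apache.maven.plugins:maven-deploy-plugin:2.8.1:deploy-file \
        -Durl="${NEXUS_ARTIFACT_URL}" \
        -DrepositoryId=criteo \
        -DgroupId=com.criteo.tarballs \
        -DartifactId=spark \
        -Dversion="${MVN_ARTIFACT_VERSION}" \
        -Dpackaging=tar.gz \
        -Dfile="${SPARK_ARTIFACT_FILE}"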

* Improve build

  * Create a pex with Hadoop dependencies
  * Mention Hadoop explicitly in the MVN artifact names
  * Fix Python versions according to PEP 440 local version identifiers (https://peps.python.org/pep-0440/#local-version-identifiers); see the example after this list
  * Update the sed command that rewrites the Python version (Spark 3.3 adds a typing annotation to __version__ in python/pyspark/version.py)
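
The resulting versions use a PEP 440 local version identifier: a "+" followed by dot-separated alphanumeric segments. A quick sanity check, assuming the packaging library is available:

    python -c 'from packaging.version import Version; v = Version("3.3.0+criteo.scala.2.12.20230320100819"); print(v.public, v.local)'
    # prints: 3.3.0 criteo.scala.2.12.20230320100819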

* Add timestamp as parameter in build_script.sh (#123)

This allows all Spark/Scala variants to share the same timestamp; see the driver sketch below.
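
A hypothetical Jenkins-side wrapper (the variable names are assumptions) that generates one timestamp and reuses it for every variant:

    #!/bin/bash
    # One timestamp for every Spark/Scala variant built in this run.
    TIMESTAMP="$(date +%Y%m%d%H%M%S)"
    for SCALA_RELEASE in 2.11 2.12; do
        ./external/docker/criteo-build/build_script.sh \
            "$MAVEN_USER" "$MAVEN_PASSWORD" \
            "$SCALA_RELEASE" "$SPARK_RELEASE" \
            "$NEXUS_ARTIFACT_URL" "$NEXUS_PYPY_URL" \
            "$TIMESTAMP"
    done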

* Bump Maven

Maven 3.6.3 is no longer available for download on the Apache Maven side, so the image now installs 3.8.8.

* Fix dependencies for spark jars

The jar:jar plugin prevents full dependency resolution from being performed.
The deploy goal runs the whole process, including the jar deployment; see
the sketch below.
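
For context, a sketch of the difference (illustrative invocations, not taken verbatim from this build):

    # Invoking mojos directly runs only those mojos: no lifecycle phases
    # execute, so dependency resolution for the build is not fully performed.
    mvn jar:jar deploy:deploy

    # Invoking the deploy phase runs the whole default lifecycle
    # (validate ... compile ... package ... install ... deploy),
    # resolving dependencies along the way.
    mvn deploy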

* Enhance spark jars with cleaner dependencies

Including:

 - Build with hive-provided
 - Build the hadoop-provided Spark with the Criteo Hadoop version

---------

Co-authored-by: w.montaz <w.montaz@criteo.com>
Co-authored-by: Anthony RABIER <anthony.rabier@gmail.com>
Co-authored-by: Anthony Rabier <a.rabier@criteo.com>
4 people authored and kt.maudoux committed May 31, 2024
1 parent 6306a89 commit d1cd237
Showing 5 changed files with 191 additions and 9 deletions.
17 changes: 17 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default.

36 changes: 27 additions & 9 deletions external/docker/criteo-build/Dockerfile
@@ -16,10 +16,17 @@

FROM filer-docker-registry.crto.in/criteo-centos-base:0.1.0-3325-gadc8d652

ARG USER_NAME
ARG USER_ID
ARG GROUP_ID

WORKDIR /root

SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Without the following plugin yum fails to install packages with
# Rpmdb checksum is invalid: dCDPT(pkg checksums)....
RUN yum -y install yum-plugin-ovl
RUN yum groupinstall -y 'Development Tools'
RUN yum install -y \
ant \
@@ -50,23 +57,20 @@ RUN yum install -y \
java-1.8.0-openjdk \
pinentry-curses \
pkgconfig \
python3 \
python3-pip \
python3-setuptools \
python3-wheel \
python3-virtualenv \
rsync \
ShellCheck \
sudo \
valgrind \
wget \
zlib-devel

# Install Maven 3 from the official binary distribution; we need Maven 3.8.8
RUN rm -rf /usr/share/maven \
&& wget --no-check-certificate https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz \
&& tar xf apache-maven-3.8.8-bin.tar.gz -C /usr/share \
&& ln -s /usr/share/apache-maven-3.8.8 /usr/share/maven

######
# Set env vars required to build Hadoop
@@ -79,3 +83,17 @@ ENV JAVA_HOME /etc/alternatives/java_sdk
# Avoid out of memory errors in builds
###
ENV MAVEN_OPTS -Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g


RUN groupadd --non-unique -g ${GROUP_ID} ${USER_NAME}
RUN useradd -l -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME}
RUN echo "${USER_NAME} ALL=NOPASSWD: ALL" > "/etc/sudoers.d/spark-build-${USER_ID}"
ENV HOME /home/${USER_NAME}
RUN mkdir /home/${USER_NAME}/.m2 && chown ${USER_NAME}: /home/${USER_NAME}/.m2
RUN echo '<settings><mirrors><mirror><id>criteo</id><mirrorOf>*</mirrorOf><url>http://nexus.criteo.prod/content/groups/criteodev</url></mirror></mirrors><servers><server><id>criteo</id><username>${criteo.repo.username}</username><password>${criteo.repo.password}</password></server></servers></settings>' > /home/${USER_NAME}/.m2/settings.xml
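# Note: ${criteo.repo.username} and ${criteo.repo.password} above are not expanded
# here; Maven interpolates them at deploy time from the -Dcriteo.repo.* properties
# that build_script.sh passes on the command line.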

# Alias python3 as python; otherwise python 2 would be picked up
RUN mv /usr/bin/python /usr/bin/python2
RUN ln -s /usr/bin/python3 /usr/bin/python

RUN rm -f /var/log/faillog /var/log/lastlog
2 changes: 2 additions & 0 deletions external/docker/criteo-build/build_config.sh
@@ -0,0 +1,2 @@
HDP_VERSION=3.3.0-criteo-20230320100819
HIVE_VERSION=2.3.9
137 changes: 137 additions & 0 deletions external/docker/criteo-build/build_script.sh
@@ -0,0 +1,137 @@
#!/bin/bash
# Usage: build_script.sh <maven_user> <maven_password> <scala_release> <spark_release> <nexus_artifact_url> <nexus_pypy_url> <timestamp>
set -x
set -e

MAVEN_USER=$1
MAVEN_PASSWORD=$2
SCALA_RELEASE=$3
SPARK_RELEASE=$4
NEXUS_ARTIFACT_URL=$5
NEXUS_PYPY_URL=$6
TIMESTAMP=$7

for var in "$MAVEN_USER" "$MAVEN_PASSWORD" "$SCALA_RELEASE" "$SPARK_RELEASE" "$NEXUS_ARTIFACT_URL" "$NEXUS_PYPY_URL" "$TIMESTAMP"; do
if [ -z "$var" ]; then
echo "Missing arguments"
exit 1
fi
done

TWINE_USERNAME=$MAVEN_USER
TWINE_PASSWORD=$MAVEN_PASSWORD

# Load HDP_VERSION and HIVE_VERSION
source external/docker/criteo-build/build_config.sh

deploy_python()
{
pyspark_version=$1
# Stamp the target version into pyspark/version.py (Spark 3.3 type-annotates __version__)
sed -i "s/__version__: str = \\\".*\\\"/__version__: str = \\\"${pyspark_version}\\\"/g" python/pyspark/version.py
# Build the wheel in a virtualenv and push it to the Nexus PyPI repository
python -m venv venv
source venv/bin/activate
pip install --upgrade pip
pip install -r python/requirements.txt
cd python
python setup.py bdist_wheel
twine upload dist/pyspark*whl -u "${TWINE_USERNAME}" -p "${TWINE_PASSWORD}" --skip-existing --repository-url "${NEXUS_PYPY_URL}/"
python setup.py clean --all
cd "$OLDPWD"
}
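# Example invocation (hypothetical version string, matching the PEP 440 format used below):
#   deploy_python "3.3.0+criteo.scala.2.12.20230320100819"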

VERSION_SUFFIX="criteo-${TIMESTAMP}"

if [ "${SCALA_RELEASE}" == "2.12" ]; then
./dev/change-scala-version.sh 2.12
MVN_SCALA_PROPERTY="-Pscala-2.12"
elif [ "${SCALA_RELEASE}" == "2.11" ]; then
./dev/change-scala-version.sh 2.11
MVN_SCALA_PROPERTY="-Pscala-2.11"
else
echo "[ERROR] Unsupported Scala release: '${SCALA_RELEASE}' (expected 2.11 or 2.12)"
exit 1
fi

SPARK_VERSION="$(mvn org.apache.maven.plugins:maven-help-plugin:evaluate -Dexpression=project.version -q -DforceStdout)"
CRITEO_VERSION="${SPARK_VERSION}-${VERSION_SUFFIX}"
SPARK_ARTIFACT_FILE="spark-${CRITEO_VERSION}-bin-${SCALA_RELEASE}.tgz"
SPARK_HDP_ARTIFACT_FILE="spark-${CRITEO_VERSION}-bin-${SCALA_RELEASE}-${HDP_VERSION}.tgz"
SPARK_JARS_ARTIFACT_FILE="spark-${CRITEO_VERSION}-jars-${SCALA_RELEASE}.tgz"
MVN_ARTIFACT_VERSION="${CRITEO_VERSION}-${SCALA_RELEASE}"
MVN_HDP_ARTIFACT_VERSION="${MVN_ARTIFACT_VERSION}-hadoop-${HDP_VERSION}"
PYTHON_PEX_VERSION="${SPARK_RELEASE}+criteo.scala.${SCALA_RELEASE}.${TIMESTAMP}"
PYTHON_HDP_PEX_VERSION="${SPARK_RELEASE}+criteo.scala.${SCALA_RELEASE}.hadoop.${HDP_VERSION}.${TIMESTAMP}"
SHUFFLE_SERVICE_JAR_FILE="dist/yarn/spark-${CRITEO_VERSION}-yarn-shuffle.jar"
MVN_COMMON_PROPERTIES="-Phive-provided -Phive-thriftserver -Pyarn -Dhive.version=${HIVE_VERSION} -Dhadoop.version=${HDP_VERSION} ${MVN_SCALA_PROPERTY}"
MVN_COMMON_DEPLOY_FILE_PROPERTIES="-Durl=${NEXUS_ARTIFACT_URL} -DrepositoryId=criteo -Dcriteo.repo.username=${MAVEN_USER} -Dcriteo.repo.password=${MAVEN_PASSWORD} -DretryFailedDeploymentCount=3"
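
# Illustration (assumed values): with SPARK_VERSION=3.3.0 (from mvn help:evaluate above),
# SPARK_RELEASE=3.3.0, SCALA_RELEASE=2.12 and TIMESTAMP=20230320100819, these expand to:
#   CRITEO_VERSION       = 3.3.0-criteo-20230320100819
#   SPARK_ARTIFACT_FILE  = spark-3.3.0-criteo-20230320100819-bin-2.12.tgz
#   MVN_ARTIFACT_VERSION = 3.3.0-criteo-20230320100819-2.12
#   PYTHON_PEX_VERSION   = 3.3.0+criteo.scala.2.12.20230320100819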

# do some house cleaning
mvn --no-transfer-progress clean
rm -f spark-*.tgz
rm -f dist/python/dist/*
rm -f python/dist/*

# change version
mvn --no-transfer-progress versions:set -DnewVersion=${CRITEO_VERSION}

# Build distribution with hadoop
./dev/make-distribution.sh --pip --name ${SCALA_RELEASE}-${HDP_VERSION} --tgz -ntp ${MVN_COMMON_PROPERTIES}

# tgz artifact deployment
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=com.criteo.tarballs \
-DartifactId=spark \
-Dversion=${MVN_HDP_ARTIFACT_VERSION} \
-Dpackaging=tar.gz \
-Dfile=${SPARK_HDP_ARTIFACT_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

deploy_python $PYTHON_HDP_PEX_VERSION

# Build distribution without hadoop
./dev/make-distribution.sh --pip --name ${SCALA_RELEASE} --tgz -ntp ${MVN_COMMON_PROPERTIES} -Phadoop-provided
# tgz artifact deployment
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=com.criteo.tarballs \
-DartifactId=spark \
-Dversion=${MVN_ARTIFACT_VERSION} \
-Dpackaging=tar.gz \
-Dfile=${SPARK_ARTIFACT_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# Create archive with jars only
(cd dist/jars && tar -czf "${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE}" ./*.jar)

# Deploy tgz jars only artifact
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=com.criteo.tarballs \
-DartifactId=spark-jars \
-Dversion=${MVN_ARTIFACT_VERSION} \
-Dpackaging=tar.gz \
-Dfile=${SPARK_JARS_ARTIFACT_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# shuffle service deployment
mvn deploy:deploy-file \
--batch-mode \
-DgroupId=org.apache.spark \
-DartifactId=yarn-shuffle_${SCALA_RELEASE} \
-Dversion=${CRITEO_VERSION} \
-Dpackaging=jar \
-Dfile=${SHUFFLE_SERVICE_JAR_FILE} \
${MVN_COMMON_DEPLOY_FILE_PROPERTIES}

# jar artifacts (for parent poms) deployment
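# altDeploymentRepository uses the maven-deploy-plugin 2.x syntax <id>::<layout>::<url>;
# the id ("criteo") must match a <server> entry in settings.xml so credentials are picked up.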
mvn deploy \
--batch-mode \
${MVN_COMMON_PROPERTIES} \
-Phadoop-provided \
-DaltDeploymentRepository=criteo::default::${NEXUS_ARTIFACT_URL} \
-Dcriteo.repo.username=${MAVEN_USER} \
-Dcriteo.repo.password=${MAVEN_PASSWORD} \
-DskipTests


# python deployment
deploy_python $PYTHON_PEX_VERSION
8 changes: 8 additions & 0 deletions python/requirements.txt
@@ -0,0 +1,8 @@
wheel
numpy
pandas
pypandoc==1.5
py4j==0.10.7
pyarrow
twine
cryptography==3.3.1
