From 24891714d38a50d9466aeb13d54df0a32fcc5d1c Mon Sep 17 00:00:00 2001 From: jetoile Date: Fri, 16 Feb 2024 11:38:47 +0100 Subject: [PATCH] [Criteo] Update build with Criteo specificities (#131) * [CRITEO] Update build * Update docker image to match the one used for 2.x builds * Add build scripts to be used with the jenkins build * Downgrade maven-deploy-plugin to version 2.8.1 (same as branch 2.X) otherwise uploading to nexus fails with 401 * Add python requirements.txt * Improve build * Create a pex with hadoop dependencies * clearly mention hadoop in MVN artefacts * Fix python versions according to https://peps.python.org/pep-0440/#local-version-identifiers * Update sed command to change python version (spark 3.3 uses python typing) * Add timestamp as parameter in build_script.sh (#123) It will allow us to have the same timestamp for all spark/scala versions * Bump Maven Maven 3.6.3 is not yet available on apache maven side * Fix dependencies for spark jars The jar:jar plugin prevents the whole dependency resolution from being done. The deploy goal will perform the whole process, including the jar deployment. 
* Enhance spark jars with cleaner dependencies Including: - hive-provided - Build spark hadoop-provided with Hadoop criteo version --------- Co-authored-by: w.montaz Co-authored-by: Anthony RABIER Co-authored-by: Anthony Rabier --- .idea/vcs.xml | 17 +++ external/docker/criteo-build/Dockerfile | 36 +++-- external/docker/criteo-build/build_config.sh | 2 + external/docker/criteo-build/build_script.sh | 137 +++++++++++++++++++ python/requirements.txt | 8 ++ 5 files changed, 191 insertions(+), 9 deletions(-) create mode 100644 .idea/vcs.xml create mode 100644 external/docker/criteo-build/build_config.sh create mode 100644 external/docker/criteo-build/build_script.sh create mode 100644 python/requirements.txt diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000000..bdf0cdcbff5db --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,17 @@ + + + + + + \ No newline at end of file diff --git a/external/docker/criteo-build/Dockerfile b/external/docker/criteo-build/Dockerfile index 834529bc49bde..94348763b3f1b 100644 --- a/external/docker/criteo-build/Dockerfile +++ b/external/docker/criteo-build/Dockerfile @@ -16,10 +16,17 @@ FROM filer-docker-registry.crto.in/criteo-centos-base:0.1.0-3325-gadc8d652 +ARG USER_NAME +ARG USER_ID +ARG GROUP_ID + WORKDIR /root SHELL ["/bin/bash", "-o", "pipefail", "-c"] +# Without the following plugin yum fails to install packages with +# Rpmdb checksum is invalid: dCDPT(pkg checksums).... 
+RUN yum -y install yum-plugin-ovl RUN yum groupinstall -y 'Development Tools' RUN yum install -y \ ant \ @@ -50,10 +57,8 @@ RUN yum install -y \ java-1.8.0-openjdk \ pinentry-curses \ pkgconfig \ - python3 \ python3-pip \ - python3-setuptools \ - python3-wheel \ + python3-virtualenv \ rsync \ ShellCheck \ sudo \ @@ -61,12 +66,11 @@ RUN yum install -y \ wget \ zlib-devel -# Install maven 3 from source -RUN rm -rf /usr/share/maven \ - && wget --no-check-certificate https://downloads.apache.org/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz \ - && tar xf apache-maven-3.5.4-bin.tar.gz -C /usr/share \ - && ln -s /usr/share/apache-maven-3.5.4 /usr/share/maven - +# we need maven 3.8.8 +RUN rm -rf /usr/share/maven \ + && wget --no-check-certificate https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz \ + && tar xf apache-maven-3.8.8-bin.tar.gz -C /usr/share \ + && ln -s /usr/share/apache-maven-3.8.8 /usr/share/maven ###### # Set env vars required to build Hadoop @@ -79,3 +83,17 @@ ENV JAVA_HOME /etc/alternatives/java_sdk # Avoid out of memory errors in builds ### ENV MAVEN_OPTS -Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g + + +RUN groupadd --non-unique -g ${GROUP_ID} ${USER_NAME} +RUN useradd -l -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME} +RUN echo "${USER_NAME} ALL=NOPASSWD: ALL" > "/etc/sudoers.d/spark-build-${USER_ID}" +ENV HOME /home/${USER_NAME} +RUN mkdir /home/${USER_NAME}/.m2 && chown ${USER_NAME}: /home/${USER_NAME}/.m2 +RUN echo 'criteo*http://nexus.criteo.prod/content/groups/criteodevcriteo${criteo.repo.username}${criteo.repo.password}' > /home/${USER_NAME}/.m2/settings.xml + +# Alias python3 to python otherwise python 2 is called +RUN mv /usr/bin/python /usr/bin/python2 +RUN ln -s /usr/bin/python3 /usr/bin/python + +RUN rm -f /var/log/faillog /var/log/lastlog \ No newline at end of file diff --git a/external/docker/criteo-build/build_config.sh b/external/docker/criteo-build/build_config.sh new file mode 
100644
index 0000000000000..629b5cd2ff50b
--- /dev/null
+++ b/external/docker/criteo-build/build_config.sh
@@ -0,0 +1,2 @@
+HDP_VERSION=3.3.0-criteo-20230320100819
+HIVE_VERSION=2.3.9
\ No newline at end of file
diff --git a/external/docker/criteo-build/build_script.sh b/external/docker/criteo-build/build_script.sh
new file mode 100644
index 0000000000000..555e5819c2939
--- /dev/null
+++ b/external/docker/criteo-build/build_script.sh
@@ -0,0 +1,137 @@
+#!/usr/bin/env bash
+# Build Spark for Criteo and publish tarballs, jars and pyspark wheels to Nexus.
+# Run from the root of the Spark checkout:
+#   build_script.sh MAVEN_USER MAVEN_PASSWORD SCALA_RELEASE SPARK_RELEASE \
+#                   NEXUS_ARTIFACT_URL NEXUS_PYPY_URL TIMESTAMP
+# NOTE: set -x prints the credentials given to mvn; the CI must mask them.
+set -x
+set -e
+
+MAVEN_USER=$1
+MAVEN_PASSWORD=$2
+SCALA_RELEASE=$3
+SPARK_RELEASE=$4
+NEXUS_ARTIFACT_URL=$5
+NEXUS_PYPY_URL=$6
+TIMESTAMP=$7
+
+for var in "$MAVEN_USER" "$MAVEN_PASSWORD" "$SCALA_RELEASE" "$SPARK_RELEASE" "$NEXUS_ARTIFACT_URL" "$NEXUS_PYPY_URL" "$TIMESTAMP"; do
+  if [ -z "$var" ]; then
+    echo "Missing arguments" >&2
+    exit 1
+  fi
+done
+
+# twine reads its credentials from these environment variables.
+export TWINE_USERNAME=$MAVEN_USER
+export TWINE_PASSWORD=$MAVEN_PASSWORD
+
+# Load HDP_VERSION and HIVE_VERSION
+source external/docker/criteo-build/build_config.sh
+
+# Build the pyspark wheel for the given version and upload it to Nexus.
+# Arguments: $1 - PEP 440 version written to python/pyspark/version.py
+deploy_python() {
+  local pyspark_version=$1
+  sed -i "s/__version__: str = \".*\"/__version__: str = \"${pyspark_version}\"/g" python/pyspark/version.py
+  python -m venv venv
+  source venv/bin/activate
+  pip install --upgrade pip
+  pip install -r python/requirements.txt
+  # Subshell: the caller's working directory is untouched even on failure.
+  (
+    cd python
+    python setup.py bdist_wheel
+    twine upload dist/pyspark*whl --skip-existing --repository-url "${NEXUS_PYPY_URL}/"
+    python setup.py clean --all
+  )
+}
+
+# Upload a single file to Nexus with deploy:deploy-file.
+# Arguments: $1 groupId, $2 artifactId, $3 version, $4 packaging, $5 file
+deploy_file() {
+  mvn deploy:deploy-file \
+    --batch-mode \
+    "-DgroupId=$1" "-DartifactId=$2" "-Dversion=$3" \
+    "-Dpackaging=$4" "-Dfile=$5" \
+    "${MVN_COMMON_DEPLOY_FILE_PROPERTIES[@]}"
+}
+
+VERSION_SUFFIX="criteo-${TIMESTAMP}"
+
+case "${SCALA_RELEASE}" in
+  2.11 | 2.12)
+    ./dev/change-scala-version.sh "${SCALA_RELEASE}"
+    MVN_SCALA_PROPERTY="-Pscala-${SCALA_RELEASE}"
+    ;;
+  *)
+    echo "[ERROR] Unsupported Scala release: ${SCALA_RELEASE}" >&2
+    exit 1
+    ;;
+esac
+
+SPARK_VERSION="$(mvn org.apache.maven.plugins:maven-help-plugin:evaluate -Dexpression=project.version -q -DforceStdout)"
+CRITEO_VERSION="${SPARK_VERSION}-${VERSION_SUFFIX}"
+SPARK_ARTIFACT_FILE="spark-${CRITEO_VERSION}-bin-${SCALA_RELEASE}.tgz"
+SPARK_HDP_ARTIFACT_FILE="spark-${CRITEO_VERSION}-bin-${SCALA_RELEASE}-${HDP_VERSION}.tgz"
+SPARK_JARS_ARTIFACT_FILE="spark-${CRITEO_VERSION}-jars-${SCALA_RELEASE}.tgz"
+MVN_ARTIFACT_VERSION="${CRITEO_VERSION}-${SCALA_RELEASE}"
+MVN_HDP_ARTIFACT_VERSION="${MVN_ARTIFACT_VERSION}-hadoop-${HDP_VERSION}"
+PYTHON_PEX_VERSION="${SPARK_RELEASE}+criteo.scala.${SCALA_RELEASE}.${TIMESTAMP}"
+PYTHON_HDP_PEX_VERSION="${SPARK_RELEASE}+criteo.scala.${SCALA_RELEASE}.hadoop.${HDP_VERSION}.${TIMESTAMP}"
+SHUFFLE_SERVICE_JAR_FILE="dist/yarn/spark-${CRITEO_VERSION}-yarn-shuffle.jar"
+# Arrays keep every Maven option a single word whatever its value contains.
+MVN_COMMON_PROPERTIES=(
+  -Phive-provided -Phive-thriftserver -Pyarn
+  "-Dhive.version=${HIVE_VERSION}" "-Dhadoop.version=${HDP_VERSION}"
+  "${MVN_SCALA_PROPERTY}"
+)
+MVN_COMMON_DEPLOY_FILE_PROPERTIES=(
+  "-Durl=${NEXUS_ARTIFACT_URL}" -DrepositoryId=criteo
+  "-Dcriteo.repo.username=${MAVEN_USER}" "-Dcriteo.repo.password=${MAVEN_PASSWORD}"
+  -DretryFailedDeploymentCount=3
+)
+
+# do some house cleaning
+mvn --no-transfer-progress clean
+rm -f spark-*.tgz
+rm -f dist/python/dist/*
+rm -f python/dist/*
+
+# change version
+mvn --no-transfer-progress versions:set "-DnewVersion=${CRITEO_VERSION}"
+
+# Build distribution with hadoop
+./dev/make-distribution.sh --pip --name "${SCALA_RELEASE}-${HDP_VERSION}" --tgz -ntp "${MVN_COMMON_PROPERTIES[@]}"
+
+# tgz artifact deployment
+deploy_file com.criteo.tarballs spark "${MVN_HDP_ARTIFACT_VERSION}" tar.gz "${SPARK_HDP_ARTIFACT_FILE}"
+
+deploy_python "${PYTHON_HDP_PEX_VERSION}"
+
+# Build distribution without hadoop
+./dev/make-distribution.sh --pip --name "${SCALA_RELEASE}" --tgz -ntp "${MVN_COMMON_PROPERTIES[@]}" -Phadoop-provided
+
+# tgz artifact deployment
+deploy_file com.criteo.tarballs spark "${MVN_ARTIFACT_VERSION}" tar.gz "${SPARK_ARTIFACT_FILE}"
+
+# Create archive with jars only (in a subshell so a failed cd aborts the build)
+(cd dist/jars && tar -czf "${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE}" *.jar)
+
+# Deploy tgz jars only artifact
+deploy_file com.criteo.tarballs spark-jars "${MVN_ARTIFACT_VERSION}" tar.gz "${SPARK_JARS_ARTIFACT_FILE}"
+
+# shuffle service deployment
+deploy_file org.apache.spark "yarn-shuffle_${SCALA_RELEASE}" "${CRITEO_VERSION}" jar "${SHUFFLE_SERVICE_JAR_FILE}"
+
+# jar artifacts (for parent poms) deployment
+mvn deploy \
+  --batch-mode \
+  "${MVN_COMMON_PROPERTIES[@]}" \
+  -Phadoop-provided \
+  "-DaltDeploymentRepository=criteo::default::${NEXUS_ARTIFACT_URL}" \
+  "-Dcriteo.repo.username=${MAVEN_USER}" "-Dcriteo.repo.password=${MAVEN_PASSWORD}" \
+  -DskipTests
+
+# python deployment
+deploy_python "${PYTHON_PEX_VERSION}"
\ No newline at end of file
diff --git a/python/requirements.txt b/python/requirements.txt
new file mode 100644
index 0000000000000..2e9d1873f3940
--- /dev/null
+++ b/python/requirements.txt
@@ -0,0 +1,8 @@
+wheel
+numpy
+pandas
+pypandoc==1.5
+py4j==0.10.7
+pyarrow
+twine
+cryptography==3.3.1