diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000000000..bdf0cdcbff5db
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/external/docker/criteo-build/Dockerfile b/external/docker/criteo-build/Dockerfile
index 834529bc49bde..94348763b3f1b 100644
--- a/external/docker/criteo-build/Dockerfile
+++ b/external/docker/criteo-build/Dockerfile
@@ -16,10 +16,17 @@
FROM filer-docker-registry.crto.in/criteo-centos-base:0.1.0-3325-gadc8d652
+ARG USER_NAME
+ARG USER_ID
+ARG GROUP_ID
+
WORKDIR /root
SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+# Without the following plugin yum fails to install packages with
+# Rpmdb checksum is invalid: dCDPT(pkg checksums)....
+RUN yum -y install yum-plugin-ovl
RUN yum groupinstall -y 'Development Tools'
RUN yum install -y \
ant \
@@ -50,10 +57,8 @@ RUN yum install -y \
java-1.8.0-openjdk \
pinentry-curses \
pkgconfig \
- python3 \
python3-pip \
- python3-setuptools \
- python3-wheel \
+ python3-virtualenv \
rsync \
ShellCheck \
sudo \
@@ -61,12 +66,11 @@ RUN yum install -y \
wget \
zlib-devel
-# Install maven 3 from source
-RUN rm -rf /usr/share/maven \
- && wget --no-check-certificate https://downloads.apache.org/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.tar.gz \
- && tar xf apache-maven-3.5.4-bin.tar.gz -C /usr/share \
- && ln -s /usr/share/apache-maven-3.5.4 /usr/share/maven
-
+# Install Maven 3.8.8 (the build requires this exact version)
+RUN rm -rf /usr/share/maven \
+ && wget --no-check-certificate https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz \
+ && tar xf apache-maven-3.8.8-bin.tar.gz -C /usr/share \
+ && ln -s /usr/share/apache-maven-3.8.8 /usr/share/maven
######
# Set env vars required to build Hadoop
@@ -79,3 +83,17 @@ ENV JAVA_HOME /etc/alternatives/java_sdk
# Avoid out of memory errors in builds
###
ENV MAVEN_OPTS -Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g
+
+
+RUN groupadd --non-unique -g ${GROUP_ID} ${USER_NAME}
+RUN useradd -l -g ${GROUP_ID} -u ${USER_ID} -k /root -m ${USER_NAME}
+RUN echo "${USER_NAME} ALL=NOPASSWD: ALL" > "/etc/sudoers.d/spark-build-${USER_ID}"
+ENV HOME /home/${USER_NAME}
+RUN mkdir /home/${USER_NAME}/.m2 && chown ${USER_NAME}: /home/${USER_NAME}/.m2
+RUN echo '<settings><mirrors><mirror><id>criteo</id><mirrorOf>*</mirrorOf><url>http://nexus.criteo.prod/content/groups/criteodev</url></mirror></mirrors><servers><server><id>criteo</id><username>${criteo.repo.username}</username><password>${criteo.repo.password}</password></server></servers></settings>' > /home/${USER_NAME}/.m2/settings.xml
+
+# Make `python` resolve to python3; otherwise Python 2 is invoked by default
+RUN mv /usr/bin/python /usr/bin/python2
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+RUN rm -f /var/log/faillog /var/log/lastlog
\ No newline at end of file
diff --git a/external/docker/criteo-build/build_config.sh b/external/docker/criteo-build/build_config.sh
new file mode 100644
index 0000000000000..629b5cd2ff50b
--- /dev/null
+++ b/external/docker/criteo-build/build_config.sh
@@ -0,0 +1,2 @@
+HDP_VERSION=3.3.0-criteo-20230320100819
+HIVE_VERSION=2.3.9
\ No newline at end of file
diff --git a/external/docker/criteo-build/build_script.sh b/external/docker/criteo-build/build_script.sh
new file mode 100644
index 0000000000000..555e5819c2939
--- /dev/null
+++ b/external/docker/criteo-build/build_script.sh
@@ -0,0 +1,137 @@
+set -x
+set -e
+
+MAVEN_USER=$1
+MAVEN_PASSWORD=$2
+SCALA_RELEASE=$3
+SPARK_RELEASE=$4
+NEXUS_ARTIFACT_URL=$5
+NEXUS_PYPY_URL=$6
+TIMESTAMP=$7
+
+for var in "$MAVEN_USER" "$MAVEN_PASSWORD" "$SCALA_RELEASE" "$SPARK_RELEASE" "$NEXUS_ARTIFACT_URL" "$NEXUS_PYPY_URL" "$TIMESTAMP"; do
+ if [ -z "$var" ]; then
+ echo "Missing arguments"
+ exit 1
+ fi
+done
+
+TWINE_USERNAME=$MAVEN_USER
+TWINE_PASSWORD=$MAVEN_PASSWORD
+
+# Load HDP_VERSION and HIVE_VERSION
+source external/docker/criteo-build/build_config.sh
+
+deploy_python()
+{
+ pyspark_version=$1
+ sed -i "s/__version__: str = \\\".*\\\"/__version__: str = \\\"${pyspark_version}\\\"/g" python/pyspark/version.py
+ python -m venv venv
+ source venv/bin/activate
+ pip install --upgrade pip
+ pip install -r python/requirements.txt
+ cd python
+ python setup.py bdist_wheel
+ twine upload dist/pyspark*whl -u ${TWINE_USERNAME} -p ${TWINE_PASSWORD} --skip-existing --repository-url "${NEXUS_PYPY_URL}/"
+ python setup.py clean --all
+ cd $OLDPWD
+}
+
+VERSION_SUFFIX="criteo-${TIMESTAMP}"
+
+if [ ${SCALA_RELEASE} == "2.12" ]; then
+ ./dev/change-scala-version.sh 2.12
+ MVN_SCALA_PROPERTY="-Pscala-2.12"
+elif [ ${SCALA_RELEASE} == "2.11" ]; then
+ ./dev/change-scala-version.sh 2.11
+ MVN_SCALA_PROPERTY="-Pscala-2.11"
+else
+ echo "[ERROR] Scala release not provided"
+ exit 1
+fi
+
+SPARK_VERSION="$(mvn org.apache.maven.plugins:maven-help-plugin:evaluate -Dexpression=project.version -q -DforceStdout)"
+CRITEO_VERSION="${SPARK_VERSION}-${VERSION_SUFFIX}"
+SPARK_ARTIFACT_FILE="spark-${CRITEO_VERSION}-bin-${SCALA_RELEASE}.tgz"
+SPARK_HDP_ARTIFACT_FILE="spark-${CRITEO_VERSION}-bin-${SCALA_RELEASE}-${HDP_VERSION}.tgz"
+SPARK_JARS_ARTIFACT_FILE="spark-${CRITEO_VERSION}-jars-${SCALA_RELEASE}.tgz"
+MVN_ARTIFACT_VERSION="${CRITEO_VERSION}-${SCALA_RELEASE}"
+MVN_HDP_ARTIFACT_VERSION="${MVN_ARTIFACT_VERSION}-hadoop-${HDP_VERSION}"
+PYTHON_PEX_VERSION="${SPARK_RELEASE}+criteo.scala.${SCALA_RELEASE}.${TIMESTAMP}"
+PYTHON_HDP_PEX_VERSION="${SPARK_RELEASE}+criteo.scala.${SCALA_RELEASE}.hadoop.${HDP_VERSION}.${TIMESTAMP}"
+SHUFFLE_SERVICE_JAR_FILE="dist/yarn/spark-${CRITEO_VERSION}-yarn-shuffle.jar"
+MVN_COMMON_PROPERTIES="-Phive-provided -Phive-thriftserver -Pyarn -Dhive.version=${HIVE_VERSION} -Dhadoop.version=${HDP_VERSION} ${MVN_SCALA_PROPERTY}"
+MVN_COMMON_DEPLOY_FILE_PROPERTIES="-Durl=${NEXUS_ARTIFACT_URL} -DrepositoryId=criteo -Dcriteo.repo.username=${MAVEN_USER} -Dcriteo.repo.password=${MAVEN_PASSWORD} -DretryFailedDeploymentCount=3"
+
+# do some house cleaning
+mvn --no-transfer-progress clean
+rm -f spark-*.tgz
+rm -f dist/python/dist/*
+rm -f python/dist/*
+
+# change version
+mvn --no-transfer-progress versions:set -DnewVersion=${CRITEO_VERSION}
+
+# Build distribution with hadoop
+./dev/make-distribution.sh --pip --name ${SCALA_RELEASE}-${HDP_VERSION} --tgz -ntp ${MVN_COMMON_PROPERTIES}
+
+# tgz artifact deployment
+mvn deploy:deploy-file \
+ --batch-mode \
+ -DgroupId=com.criteo.tarballs \
+ -DartifactId=spark \
+ -Dversion=${MVN_HDP_ARTIFACT_VERSION} \
+ -Dpackaging=tar.gz \
+ -Dfile=${SPARK_HDP_ARTIFACT_FILE} \
+ ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}
+
+deploy_python $PYTHON_HDP_PEX_VERSION
+
+# Build distribution without hadoop
+./dev/make-distribution.sh --pip --name ${SCALA_RELEASE} --tgz -ntp ${MVN_COMMON_PROPERTIES} -Phadoop-provided
+# tgz artifact deployment
+mvn deploy:deploy-file \
+ --batch-mode \
+ -DgroupId=com.criteo.tarballs \
+ -DartifactId=spark \
+ -Dversion=${MVN_ARTIFACT_VERSION} \
+ -Dpackaging=tar.gz \
+ -Dfile=${SPARK_ARTIFACT_FILE} \
+ ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}
+
+# Create archive with jars only
+cd dist/jars && tar -czf ${OLDPWD}/${SPARK_JARS_ARTIFACT_FILE} *.jar; cd $OLDPWD
+
+# Deploy tgz jars only artifact
+mvn deploy:deploy-file \
+ --batch-mode \
+ -DgroupId=com.criteo.tarballs \
+ -DartifactId=spark-jars \
+ -Dversion=${MVN_ARTIFACT_VERSION} \
+ -Dpackaging=tar.gz \
+ -Dfile=${SPARK_JARS_ARTIFACT_FILE} \
+ ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}
+
+# shuffle service deployment
+mvn deploy:deploy-file \
+ --batch-mode \
+ -DgroupId=org.apache.spark \
+ -DartifactId=yarn-shuffle_${SCALA_RELEASE} \
+ -Dversion=${CRITEO_VERSION} \
+ -Dpackaging=jar \
+ -Dfile=${SHUFFLE_SERVICE_JAR_FILE} \
+ ${MVN_COMMON_DEPLOY_FILE_PROPERTIES}
+
+# jar artifacts (for parent poms) deployment
+mvn deploy \
+ --batch-mode \
+ ${MVN_COMMON_PROPERTIES} \
+ -Phadoop-provided \
+ -DaltDeploymentRepository=criteo::default::${NEXUS_ARTIFACT_URL} \
+ -Dcriteo.repo.username=${MAVEN_USER} \
+ -Dcriteo.repo.password=${MAVEN_PASSWORD} \
+ -DskipTests
+
+
+# python deployment
+deploy_python $PYTHON_PEX_VERSION
\ No newline at end of file
diff --git a/python/requirements.txt b/python/requirements.txt
new file mode 100644
index 0000000000000..2e9d1873f3940
--- /dev/null
+++ b/python/requirements.txt
@@ -0,0 +1,8 @@
+wheel
+numpy
+pandas
+pypandoc==1.5
+py4j==0.10.7
+pyarrow
+twine
+cryptography==3.3.1