Usage: Set up the PySpark environment (requires Java 8).
Run: source ~/configure_pyspark.sh
Confirm: echo $JAVA_HOME && java -version
pipenv shell
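The same confirmation can be done from inside the pipenv shell with Python (a minimal sketch; assumes java is on the PATH and JAVA_HOME was exported by configure_pyspark.sh):

import os, subprocess
print(os.environ.get("JAVA_HOME"))    # should point at a Java 8 install
subprocess.run(["java", "-version"])  # the JVM prints its version (to stderr)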
Notebook setup (e.g. Google Colab) for Java 8 + Spark 2.4.4 + findspark:
- !apt-get install openjdk-8-jdk-headless -qq > /dev/null
- !wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
- !tar xf spark-2.4.4-bin-hadoop2.7.tgz
- !pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"
import findspark
findspark.init()
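# Optional sanity check (a minimal sketch, assuming the steps above completed):
print(findspark.find())       # prints the SPARK_HOME directory resolved above
import pyspark
print(pyspark.__version__)    # pyspark is now importable, e.g. 2.4.4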
from pyspark import SparkConf
conf: SparkConf = SparkConf().setAppName("pyspark-local")
#
# Refer AWS-SDK dependency: https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws/3.3.1
#
conf.set('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901')
#
# OR use:
# import os
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages "org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.901" pyspark-shell'
#
# Refer: https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Authenticating_with_S3 for Hadoop S3A file system
# e.g. conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider')  # static access/secret key pair
#
# To use public S3 - which does not need any credentials:
# conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.AnonymousAWSCredentialsProvider')
#
# To use AWS profile:
# export AWS_PROFILE="myProfile"
# conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'com.amazonaws.auth.profile.ProfileCredentialsProvider')
#
# Note: com.amazonaws.services.s3.enableV4 is a JVM system property (V4 request signing);
# to reliably take effect it is usually passed as -Dcom.amazonaws.services.s3.enableV4=true
# via spark.driver.extraJavaOptions / spark.executor.extraJavaOptions rather than as a Spark conf key.
conf.set('com.amazonaws.services.s3.enableV4', 'true')
conf.set('spark.hadoop.fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')
conf.set('spark.hadoop.fs.s3a.aws.credentials.provider', 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider')
# <access_key>, <secret_key>, <token> below are placeholders; pass your temporary (STS) credentials as strings.
conf.set('spark.hadoop.fs.s3a.access.key', <access_key>)
conf.set('spark.hadoop.fs.s3a.secret.key', <secret_key>)
conf.set('spark.hadoop.fs.s3a.session.token', <token>)
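#
# Alternative sketch (an assumption, not part of the original notes): pull the same
# temporary credentials from the standard AWS environment variables instead of
# hard-coding them in source:
#
# import os
# conf.set('spark.hadoop.fs.s3a.access.key', os.environ['AWS_ACCESS_KEY_ID'])
# conf.set('spark.hadoop.fs.s3a.secret.key', os.environ['AWS_SECRET_ACCESS_KEY'])
# conf.set('spark.hadoop.fs.s3a.session.token', os.environ['AWS_SESSION_TOKEN'])
#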
from pyspark import SparkContext
sc: SparkContext = SparkContext.getOrCreate(conf=conf)
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.config(conf=conf).getOrCreate()
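With the session configured above, s3a:// paths can be read directly. A minimal sketch (the bucket and key below are hypothetical placeholders, not from these notes):

df = spark.read.csv("s3a://my-bucket/some/prefix/data.csv", header=True)
df.show(5)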
Example: spark-submit on YARN (cluster mode) with G1GC/Kryo tuning and dynamic allocation:
spark-submit \
  --conf "spark.driver.extraJavaOptions=-XX:+UseG1GC -verbose:gc -XX:+PrintGCTimeStamps -XX:InitiatingHeapOccupancyPercent=35 -Dlinear.properties.file=./proposal/proposal-conformance-LTS-dev.properties -DexecutionDate=2020/09/16" \
  --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:InitiatingHeapOccupancyPercent=35 -Dlinear.properties.file=./proposal/proposal-conformance-LTS-dev.properties -DexecutionDate=2020/09/16" \
  --name linear-proposalheader-conformed-dev \
  --conf spark.driver.memory=12g \
  --conf spark.driver.cores=2 \
  --conf spark.executor.memory=25g \
  --conf spark.executor.cores=3 \
  --conf spark.kryo.unsafe=true \
  --conf spark.kryoserializer.buffer=300M \
  --conf spark.kryoserializer.buffer.max=1024M \
  --conf spark.task.maxFailures=10 \
  --conf spark.yarn.executor.memoryOverhead=5120m \
  --conf spark.dynamicAllocation.enabled=true \
  --conf spark.dynamicAllocation.initialExecutors=75 \
  --conf spark.executor.extraClassPath=/usr/lib/spark/jars/ \
  --master yarn \
  --deploy-mode cluster \
  --class com.dtci.linear.core.spark.SparkApplication \
  s3://dp-repository-dev/dp-linear-conformance/dp-linear-conformation-Airflow-Oct15__Sept15_as_DayOne.jar
Refer: PySpark MongoDB connector
./bin/pyspark --conf "spark.mongodb.input.uri=mongodb://127.0.0.1/test.myCollection?readPreference=primaryPreferred" \
--conf "spark.mongodb.output.uri=mongodb://127.0.0.1/test.myCollection" \
--packages org.mongodb.spark:mongo-spark-connector_2.12:3.0.1
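Once the shell starts with the connector package, the collection from spark.mongodb.input.uri can be read as a DataFrame. A minimal sketch, assuming the mongo-spark-connector 3.0.x short format name "mongo" and the local test.myCollection from the URIs above:

df = spark.read.format("mongo").load()            # reads spark.mongodb.input.uri
df.printSchema()
df.write.format("mongo").mode("append").save()    # writes to spark.mongodb.output.uri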
Refer: Spark_on_Kubernetes.ppt for Spark on EKS