From d20ba6c0f19148a44a5a18a1c07fdaf915d706e1 Mon Sep 17 00:00:00 2001
From: Baohe Zhang
Date: Tue, 2 Jun 2020 12:45:06 -0500
Subject: [PATCH] [YSPARK-1595] Move TestSparkDistributedCache to spark-starter
 (#21)

* Add spark distributed cache oozie example
* Delete script and update workflow
* Update distributed cache to access file from hdfs
* Add README and update sparkTag
---
 src/main/resources/data/firstarchive.tgz     | Bin 0 -> 137 bytes
 src/main/resources/data/firstfile.txt        |   1 +
 src/main/resources/data/secondarchive.tgz    | Bin 0 -> 138 bytes
 src/main/resources/data/secondfile.txt       |   1 +
 src/main/resources/data/singlearchive.tgz    | Bin 0 -> 139 bytes
 src/main/resources/data/singlefile.txt       |   1 +
 src/main/resources/data/thirdarchive.tgz     | Bin 0 -> 137 bytes
 src/main/resources/data/thirdfile.txt        |   1 +
 .../oozie/spark_distributed_cache/README.md  |  17 ++
 .../spark_distributed_cache/job.properties   |   6 +
 .../spark_distributed_cache/workflow.xml     | 276 ++++++++++++++++++
 .../SparkDistributedCacheSingleArchive.scala |  37 +++
 .../SparkDistributedCacheSingleFile.scala    |  37 +++
 .../SparkDistributedCacheThreeArchives.scala |  42 +++
 .../SparkDistributedCacheThreeFiles.scala    |  41 +++
 15 files changed, 460 insertions(+)
 create mode 100644 src/main/resources/data/firstarchive.tgz
 create mode 100644 src/main/resources/data/firstfile.txt
 create mode 100644 src/main/resources/data/secondarchive.tgz
 create mode 100644 src/main/resources/data/secondfile.txt
 create mode 100644 src/main/resources/data/singlearchive.tgz
 create mode 100644 src/main/resources/data/singlefile.txt
 create mode 100644 src/main/resources/data/thirdarchive.tgz
 create mode 100644 src/main/resources/data/thirdfile.txt
 create mode 100644 src/main/resources/oozie/spark_distributed_cache/README.md
 create mode 100644 src/main/resources/oozie/spark_distributed_cache/job.properties
 create mode 100644 src/main/resources/oozie/spark_distributed_cache/workflow.xml
 create mode 100644 src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleArchive.scala
 create mode 100644 src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleFile.scala
 create mode 100644 src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeArchives.scala
 create mode 100644 src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeFiles.scala

diff --git a/src/main/resources/data/firstarchive.tgz b/src/main/resources/data/firstarchive.tgz
new file mode 100644
index 0000000000000000000000000000000000000000..e37a48afe109ba103ad79ff391ba786fa7981636
GIT binary patch
literal 137
[base85 payload garbled in extraction; omitted]
literal 0
HcmV?d00001

diff --git a/src/main/resources/data/firstfile.txt b/src/main/resources/data/firstfile.txt
new file mode 100644
--- /dev/null
+++ b/src/main/resources/data/firstfile.txt
@@ -0,0 +1 @@
+20

diff --git a/src/main/resources/data/secondarchive.tgz b/src/main/resources/data/secondarchive.tgz
new file mode 100644
GIT binary patch
literal 138
[base85 payload garbled in extraction; omitted]
literal 0
HcmV?d00001

diff --git a/src/main/resources/data/secondfile.txt b/src/main/resources/data/secondfile.txt
new file mode 100644
index 0000000000000..64bb6b746dcea
--- /dev/null
+++ b/src/main/resources/data/secondfile.txt
@@ -0,0 +1 @@
+30

diff --git a/src/main/resources/data/singlearchive.tgz b/src/main/resources/data/singlearchive.tgz
new file mode 100644
index 0000000000000000000000000000000000000000..1ecf02129b1aa34c145e0e07eb04bdd98a8e7206
GIT binary patch
literal 139
[base85 payload garbled in extraction; omitted]
literal 0
HcmV?d00001

diff --git a/src/main/resources/data/singlefile.txt b/src/main/resources/data/singlefile.txt
new file mode 100644
index 0000000000000..29d6383b52c13
--- /dev/null
+++ b/src/main/resources/data/singlefile.txt
@@ -0,0 +1 @@
+100

diff --git a/src/main/resources/data/thirdarchive.tgz b/src/main/resources/data/thirdarchive.tgz
new file mode 100644
index 0000000000000000000000000000000000000000..6284a890bbf267aa2dbcc690da2294ae05f0cc60
GIT binary patch
literal 137
[base85 payload garbled in extraction; omitted]
literal 0
HcmV?d00001

diff --git a/src/main/resources/data/thirdfile.txt b/src/main/resources/data/thirdfile.txt
new file mode 100644
--- /dev/null
+++ b/src/main/resources/data/thirdfile.txt
@@ -0,0 +1 @@
+50
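Background for the data files above (an aside, not part of the patch): each .tgz archive packs the correspondingly named one-line .txt file, and the test jobs read them after YARN localizes the distributed-cache entries into each container's working directory. A programmatic alternative for shipping a single file is SparkContext.addFile plus SparkFiles.get; a minimal sketch under that assumption (the HDFS path and object name are hypothetical):

    import org.apache.spark.{SparkConf, SparkContext, SparkFiles}

    object AddFileSketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("AddFileSketch"))
        // Ship the file to the driver and every executor.
        sc.addFile("hdfs:///tmp/data/singlefile.txt")
        val values = sc.parallelize(1 to 3).map { _ =>
          // Each task resolves its local copy of the shipped file by name.
          val src = scala.io.Source.fromFile(SparkFiles.get("singlefile.txt"))
          try src.getLines().next().trim.toInt finally src.close()
        }.collect()
        println(values.mkString(","))
        sc.stop()
      }
    }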
[The diffs for README.md and job.properties did not survive extraction; only the file list above records them.]

diff --git a/src/main/resources/oozie/spark_distributed_cache/workflow.xml b/src/main/resources/oozie/spark_distributed_cache/workflow.xml
new file mode 100644
--- /dev/null
+++ b/src/main/resources/oozie/spark_distributed_cache/workflow.xml
[The XML markup of the 276-line workflow was stripped in extraction; the recoverable content follows.]

Workflow-wide settings: ${jobTracker}, ${nameNode}.

Every spark action sets:
  configuration : oozie.action.sharelib.for.spark = ${sparkTag}
  master / mode : yarn / cluster
  jar           : spark-starter-2.0-SNAPSHOT-jar-with-dependencies.jar
  spark-opts    : --num-executors 3 --executor-memory 2g --executor-cores 1 --queue default

Actions, in file order (name, class, then arguments):

 1. SparkDistributedCacheOneFile
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleFile
    singlefile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlefile.txt

 2. SparkDistributedCacheOneFileWithHash
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleFile
    renamed.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlefile.txt#renamed.txt

 3. SparkDistributedCacheThreeFiles
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheThreeFiles
    firstfile.txt
    renamedfile.txt
    thirdfile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/firstfile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/secondfile.txt#renamedfile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/thirdfile.txt

 4. SparkDistributedCacheOneFileHashBadFile
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleFile
    singlefile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlefile.txt#badfile.txt

 5. SparkDistributedCacheNonExistFile
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleFile
    nonexistentfile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/nonexistentfile.txt

 6. SparkDistributedCacheOneFileFromHdfs
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleFile
    singlefile.txt
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlefile.txt

 7. SparkDistributedCacheOneArchive
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleArchive
    singlearchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlearchive.tgz

 8. SparkDistributedCacheOneArchiveWithHash
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleArchive
    renamed.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlearchive.tgz#renamed.tgz

 9. SparkDistributedCacheThreeArchives
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheThreeArchives
    firstarchive.tgz
    renamedarchive.tgz
    thirdarchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/firstarchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/secondarchive.tgz#renamedarchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/thirdarchive.tgz

10. SparkDistributedCacheOneArchiveHashBad
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleArchive
    singlearchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlearchive.tgz#badfile.tgz

11. SparkDistributedCacheNonExistArchive
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleArchive
    nonexistentarchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/nonexistentarchive.tgz

12. SparkDistributedCacheOneArchiveFromHdfs
    com.yahoo.spark.starter.distributedcache.SparkDistributedCacheSingleArchive
    singlearchive.tgz
    hdfs:///user/${wf:conf('user.name')}/${wfRoot}/data/singlearchive.tgz

On error, every action transitions to the kill node, which reports:
  Workflow failed, error message[${wf:errorMessage(wf:lastErrorNode())}]
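A note on the #renamed.txt and #renamed.tgz suffixes above (an aside, not part of the patch): Hadoop's distributed cache treats the URI fragment as the local link name, so the container sees the file under the renamed path while HDFS keeps the original name. A tiny sketch of the parsing, using java.net.URI (the user path is hypothetical):

    import java.net.URI

    object FragmentSketch {
      def main(args: Array[String]): Unit = {
        val uri = new URI("hdfs:///user/me/data/singlefile.txt#renamed.txt")
        // The path identifies the file on HDFS; the fragment is the local link name.
        println(uri.getPath)     // /user/me/data/singlefile.txt
        println(uri.getFragment) // renamed.txt
      }
    }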
diff --git a/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleArchive.scala b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleArchive.scala
new file mode 100644
index 0000000000000..f142827491834
--- /dev/null
+++ b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleArchive.scala
@@ -0,0 +1,37 @@
+package com.yahoo.spark.starter.distributedcache
+
+import org.apache.spark._
+import java.io.{BufferedReader, FileReader}
+
+object SparkDistributedCacheSingleArchive {
+  def main(args: Array[String]): Unit = {
+    if (args.length < 1) {
+      System.err.println("Usage: SparkDistributedCacheSingleArchive <archive>")
+      System.exit(1)
+    }
+    val conf = new SparkConf().setAppName("SparkDistributedCacheSingleArchive")
+    val spark = new SparkContext(conf)
+
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
+    val result = spark.parallelize(testData).reduceByKey {
+      // The archive is expected to contain a file named singlefile.txt
+      // holding the single value 100.
+      val in = new BufferedReader(new FileReader(args(0) + "/singlefile.txt"))
+      val fileVal = in.readLine().toInt
+      in.close()
+      _ * fileVal + _ * fileVal
+    }.collect()
+    println("result is: " + result.mkString(","))
+    val pass = result.toSet == Set((1, 200), (2, 300), (3, 500))
+    println("pass is: " + pass)
+
+    if (!pass) {
+      println("Error, set isn't as expected")
+      spark.stop()
+      // We have to throw for the Spark application master to mark the app as failed.
+      throw new Exception("Error, set isn't as expected")
+    }
+    spark.stop()
+  }
+}
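A note on the reduce function above (an aside, not part of the patch): the block passed to reduceByKey runs its statements once, on the driver, when the argument expression is evaluated; in yarn cluster mode the driver container also receives the localized files, so the read succeeds there. The final expression uses Scala placeholder syntax and expands to a two-argument function. A desugared equivalent:

    object PlaceholderSketch {
      def main(args: Array[String]): Unit = {
        val fileVal = 100 // stands in for the value read from the cached file
        // `_ * fileVal + _ * fileVal` desugars to this explicit two-arg function:
        val reduce: (Int, Int) => Int = (x, y) => x * fileVal + y * fileVal
        assert(reduce(1, 1) == 200)
      }
    }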
diff --git a/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleFile.scala b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleFile.scala
new file mode 100644
index 0000000000000..a1d971b0c9935
--- /dev/null
+++ b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheSingleFile.scala
@@ -0,0 +1,37 @@
+package com.yahoo.spark.starter.distributedcache
+
+import org.apache.spark._
+import java.io.{BufferedReader, FileReader}
+
+object SparkDistributedCacheSingleFile {
+  def main(args: Array[String]): Unit = {
+    if (args.length < 1) {
+      System.err.println("Usage: SparkDistributedCacheSingleFile <file>")
+      System.exit(1)
+    }
+    val conf = new SparkConf().setAppName("SparkDistributedCacheSingleFile")
+    val spark = new SparkContext(conf)
+
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
+    val result = spark.parallelize(testData).reduceByKey {
+      // The file is expected to contain the single value 100.
+      val in = new BufferedReader(new FileReader(args(0)))
+      val fileVal = in.readLine().toInt
+      in.close()
+      _ * fileVal + _ * fileVal
+    }.collect()
+    println("result is: " + result.mkString(","))
+    val pass = result.toSet == Set((1, 200), (2, 300), (3, 500))
+    println("pass is: " + pass)
+
+    if (!pass) {
+      println("Error, set isn't as expected")
+      spark.stop()
+      // We have to throw for the Spark application master to mark the app as failed.
+      throw new Exception("Error, set isn't as expected")
+    }
+    spark.stop()
+  }
+}

diff --git a/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeArchives.scala b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeArchives.scala
new file mode 100644
index 0000000000000..3abfc86cdd7b4
--- /dev/null
+++ b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeArchives.scala
@@ -0,0 +1,42 @@
+package com.yahoo.spark.starter.distributedcache
+
+import org.apache.spark._
+import java.io.{BufferedReader, FileReader}
+
+object SparkDistributedCacheThreeArchives {
+  def main(args: Array[String]): Unit = {
+    if (args.length < 3) {
+      System.err.println("Usage: SparkDistributedCacheThreeArchives <archive1> <archive2> <archive3>")
+      System.exit(1)
+    }
+    val conf = new SparkConf().setAppName("SparkDistributedCacheThreeArchives")
+    val spark = new SparkContext(conf)
+
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
+    val result = spark.parallelize(testData).reduceByKey {
+      // Each archive is expected to contain a single file (firstfile.txt,
+      // secondfile.txt, thirdfile.txt) holding the values 20, 30, and 50.
+      val in = new BufferedReader(new FileReader(args(0) + "/firstfile.txt"))
+      val fileVal = in.readLine().toInt
+      val in2 = new BufferedReader(new FileReader(args(1) + "/secondfile.txt"))
+      val fileVal2 = in2.readLine().toInt
+      val in3 = new BufferedReader(new FileReader(args(2) + "/thirdfile.txt"))
+      val fileVal3 = in3.readLine().toInt
+      in.close()
+      in2.close()
+      in3.close()
+      _ * (fileVal + fileVal2 + fileVal3) + _ * (fileVal + fileVal2 + fileVal3)
+    }.collect()
+    println("result is: " + result.mkString(","))
+    val pass = result.toSet == Set((1, 200), (2, 300), (3, 500))
+    println("pass is: " + pass)
+
+    if (!pass) {
+      println("Error, set isn't as expected")
+      spark.stop()
+      // We have to throw for the Spark application master to mark the app as failed.
+      throw new Exception("Error, set isn't as expected")
+    }
+    spark.stop()
+  }
+}
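For a quick check of the expected results without a cluster (an aside, not part of the patch; the values 20, 30, and 50 come from the three data files, summing to 100): with f(x, y) = 100x + 100y, key 1 reduces to 1*100 + 1*100 = 200, key 2 to 1*100 + 2*100 = 300, and key 3 to 5*100 + 0*100 = 500. The same check in plain Scala collections:

    object ExpectedResultCheck {
      def main(args: Array[String]): Unit = {
        val sum = 20 + 30 + 50 // = 100, the combined value of the three files
        val testData = Seq((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
        // Group by key and apply the same reduce function the jobs use.
        val result = testData.groupBy(_._1).map { case (k, pairs) =>
          k -> pairs.map(_._2).reduce((x, y) => x * sum + y * sum)
        }.toSet
        assert(result == Set((1, 200), (2, 300), (3, 500)))
        println(result)
      }
    }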
diff --git a/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeFiles.scala b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeFiles.scala
new file mode 100644
index 0000000000000..b5f11097c8010
--- /dev/null
+++ b/src/main/scala/com/yahoo/spark/starter/distributedcache/SparkDistributedCacheThreeFiles.scala
@@ -0,0 +1,41 @@
+package com.yahoo.spark.starter.distributedcache
+
+import org.apache.spark._
+import java.io.{BufferedReader, FileReader}
+
+object SparkDistributedCacheThreeFiles {
+  def main(args: Array[String]): Unit = {
+    if (args.length < 3) {
+      System.err.println("Usage: SparkDistributedCacheThreeFiles <file1> <file2> <file3>")
+      System.exit(1)
+    }
+    val conf = new SparkConf().setAppName("SparkDistributedCacheThreeFiles")
+    val spark = new SparkContext(conf)
+
+    val testData = Array((1, 1), (1, 1), (2, 1), (3, 5), (2, 2), (3, 0))
+    val result = spark.parallelize(testData).reduceByKey {
+      // The three files are expected to contain the single values 20, 30, and 50.
+      val in = new BufferedReader(new FileReader(args(0)))
+      val fileVal = in.readLine().toInt
+      val in2 = new BufferedReader(new FileReader(args(1)))
+      val fileVal2 = in2.readLine().toInt
+      val in3 = new BufferedReader(new FileReader(args(2)))
+      val fileVal3 = in3.readLine().toInt
+      in.close()
+      in2.close()
+      in3.close()
+      _ * (fileVal + fileVal2 + fileVal3) + _ * (fileVal + fileVal2 + fileVal3)
+    }.collect()
+    println("result is: " + result.mkString(","))
+    val pass = result.toSet == Set((1, 200), (2, 300), (3, 500))
+    println("pass is: " + pass)
+
+    if (!pass) {
+      println("Error, set isn't as expected")
+      spark.stop()
+      // We have to throw for the Spark application master to mark the app as failed.
+      throw new Exception("Error, set isn't as expected")
+    }
+    spark.stop()
+  }
+}
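One possible hardening of the reads above (an editorial suggestion, not part of the patch): the readers could be closed even when a read throws by using scala.util.Using, available from Scala 2.13. A minimal sketch:

    import java.io.{BufferedReader, FileReader}
    import scala.util.Using

    object SafeReadSketch {
      // Reads a single integer from a file, closing the reader even on failure.
      def readInt(path: String): Int =
        Using.resource(new BufferedReader(new FileReader(path))) { in =>
          in.readLine().trim.toInt
        }
    }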