0.4.0 - remove compiled python, build for spark 2.2 - 3.0, update sbt & packages, update README
erikerlandson committed Jun 20, 2020
1 parent 62b6193 commit b400522
Showing 4 changed files with 46 additions and 73 deletions.
41 changes: 25 additions & 16 deletions README.md
@@ -7,15 +7,24 @@ https://isarn.github.io/isarn-sketches-spark/latest/api/#org.isarnproject.sketch
## How to use in your project

``` scala
// Note that the version of spark and python is part of the release name.
// This example is for spark 2.2 and python 2.7:
libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.3.1-sp2.2-py2.7"
// Note that the version of spark is part of the release name.
// This example is for spark 2.4:
libraryDependencies += "org.isarnproject" %% "isarn-sketches-spark" % "0.4.0-sp2.4"
```

** Currently supported: python 2.7, 3.6 X spark 2.2, 2.3 X scala 2.11 **
Currently supported:

- spark 2.2, scala 2.11
- spark 2.3, scala 2.11
- spark 2.4, scala 2.11 and 2.12
- spark 3.0, scala 2.12

If you are interested in a python/spark/scala build that is not listed above, please contact me and/or file an issue!

Python code is also packaged with all of the artifacts above.
Spark will automatically extract and compile Python components for use with PySpark.
Python 2 and 3 are supported. Note that Python 2 is EOL as of January 2020.
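
The packaged Python code can be exercised by attaching the same artifact to a PySpark session; for example (a sketch assuming the spark 2.4 / scala 2.12 build listed above):

```bash
$ pyspark --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
```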

This package builds against some `% Provided` Apache Spark dependencies:
```scala
libraryDependencies += "org.apache.spark" %% "spark-core" % sparkVersion
```
@@ -26,9 +35,9 @@ libraryDependencies += "org.apache.spark" %% "spark-mllib" % sparkVersion
## How to use from the Spark CLI
Several Spark CLI tools accept the `--packages` argument, as with this `spark-shell` example:
```bash
$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.11:0.3.1-sp2.3-py3.6"
$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
```
Note that you need to explicitly include the scala version as part of the package name
Note that you need to explicitly include the scala version as part of the package name.
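
For example, spark 2.4 is published for both scala 2.11 and scala 2.12, so the two coordinates would be (a sketch following the naming scheme above):

```bash
$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.11:0.4.0-sp2.4"
$ spark-shell --packages "org.isarnproject:isarn-sketches-spark_2.12:0.4.0-sp2.4"
```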

## Examples

@@ -244,7 +253,7 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
>>> data = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
>>> data = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
>>> agg = data.agg(tdigestDoubleUDAF("x"))
>>> td = agg.first()[0]
>>> td.cdfInverse(0.5)
@@ -257,10 +266,10 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
>>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in xrange(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
>>> data = sc.parallelize([[[gauss(0,1),gauss(0,1),gauss(0,1)]] for x in range(1000)]).toDF(StructType([StructField("x", ArrayType(DoubleType()))]))
>>> agg = data.agg(tdigestDoubleArrayUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in td]
>>> [t.cdfInverse(0.5) for t in tds]
[0.046116924117141189, -0.011071666930287466, -0.019006033872431105]
>>>
```
@@ -271,7 +280,7 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.ml.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> agg = data.agg(tdigestMLVecUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in tds]
@@ -285,7 +294,7 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.mllib.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> data = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> agg = data.agg(tdigestMLLibVecUDAF("x"))
>>> tds = agg.first()[0]
>>> [t.cdfInverse(0.5) for t in tds]
@@ -298,8 +307,8 @@ scala> td.show()
>>> from isarnproject.sketches.udaf.tdigest import *
>>> from random import gauss
>>> from pyspark.sql.types import *
>>> x = sc.parallelize([[gauss(0,1)] for x in xrange(1000)]).toDF(StructType([StructField("x", DoubleType())]))
>>> g = sc.parallelize([[1+x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> x = sc.parallelize([[gauss(0,1)] for x in range(1000)]).toDF(StructType([StructField("x", DoubleType())]))
>>> g = sc.parallelize([[1+x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> data = g.crossJoin(x)
>>> tds = data.groupBy("g").agg(tdigestDoubleUDAF("x").alias("tdigests"))
>>> tds.show()
@@ -330,8 +339,8 @@ scala> td.show()
>>> from random import gauss
>>> from pyspark.ml.linalg import VectorUDT, Vectors
>>> from pyspark.sql.types import *
>>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in xrange(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> g = sc.parallelize([[1+x] for x in xrange(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> x = sc.parallelize([[Vectors.dense([gauss(0,1),gauss(0,1),gauss(0,1)])] for x in range(1000)]).toDF(StructType([StructField("x", VectorUDT())]))
>>> g = sc.parallelize([[1+x] for x in range(5)]).toDF(StructType([StructField("g", IntegerType())]))
>>> data = g.crossJoin(x)
>>> tds = data.groupBy("g").agg(tdigestMLVecUDAF("x").alias("tdigests"))
>>> tds.show()
@@ -422,7 +431,7 @@ scala> imp.show
>>> fiMod = fi.fit(training) \
... .setTargetModel(lrModel) \
... .setDeviationMeasure("rms-dev") \
... .setFeatureNames(["x%d" % (j) for j in xrange(10)])
... .setFeatureNames(["x%d" % (j) for j in range(10)])
>>> imp = fiMod.transform(training)
>>> imp.show()
+----+-------------------+
68 changes: 16 additions & 52 deletions build.sbt
@@ -1,31 +1,26 @@
// xsbt clean unidoc previewSite
// xsbt clean unidoc ghpagesPushSite
// xsbt -Dsbt.global.base=/home/eje/.sbt/sonatype +publish
// make sure sparkVersion and pythonVersion are set as you want them prior to +publish
// xsbt +publish
// https://oss.sonatype.org
// make sure sparkVersion is set as you want prior to +publish

import scala.sys.process._

name := "isarn-sketches-spark"

organization := "org.isarnproject"

val packageVersion = "0.3.1"
val packageVersion = "0.4.0"

val sparkVersion = "2.2.2"

val pythonVersion = "2.7"
val sparkVersion = "3.0.0"

val sparkSuffix = s"""sp${sparkVersion.split('.').take(2).mkString(".")}"""

val pythonSuffix = s"""py${pythonVersion.split('.').take(2).mkString(".")}"""

val pythonCMD = s"""python${pythonVersion.split('.').head}"""

version := s"${packageVersion}-${sparkSuffix}-${pythonSuffix}"
version := s"${packageVersion}-${sparkSuffix}"

scalaVersion := "2.11.12"
scalaVersion := "2.12.11"

crossScalaVersions := Seq("2.11.12") // scala 2.12 when spark supports it
crossScalaVersions := Seq("2.12.11") // scala 2.12.11 when spark supports it

pomIncludeRepository := { _ => false }

@@ -92,46 +87,15 @@ licenses += ("Apache-2.0", url("http://opensource.org/licenses/Apache-2.0"))

scalacOptions ++= Seq("-unchecked", "-deprecation", "-feature")

lazy val deletePYC = taskKey[Unit]("Delete .pyc files")

deletePYC := {
val s: TaskStreams = streams.value
s.log.info("delete .pyc files...")
val cmd = "bash" :: "-c" :: "rm -f $(find python -name *.pyc)" :: Nil
val stat = (cmd !)
if (stat == 0) {
s.log.info("delete .pyc succeeded")
} else {
throw new IllegalStateException("delete .pyc failed")
}
}

lazy val compilePython = taskKey[Unit]("Compile python files")

compilePython := {
val s: TaskStreams = streams.value
s.log.info("compiling python...")
val stat = (Seq(pythonCMD, "-m", "compileall", "python/") !)
if (stat == 0) {
s.log.info("python compile succeeded")
} else {
throw new IllegalStateException("python compile failed")
}
}

compilePython := (compilePython.dependsOn(deletePYC)).value

(packageBin in Compile) := ((packageBin in Compile).dependsOn(compilePython)).value

mappings in (Compile, packageBin) ++= Seq(
(baseDirectory.value / "python" / "isarnproject" / "__init__.pyc") -> "isarnproject/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.pyc") -> "isarnproject/pipelines/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.pyc") -> "isarnproject/pipelines/fi.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.pyc") -> "isarnproject/sketches/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.pyc") -> "isarnproject/sketches/udaf/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.pyc") -> "isarnproject/sketches/udaf/tdigest.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.pyc") -> "isarnproject/sketches/udt/__init__.pyc",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.pyc") -> "isarnproject/sketches/udt/tdigest.pyc"
(baseDirectory.value / "python" / "isarnproject" / "__init__.py") -> "isarnproject/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "__init__.py") -> "isarnproject/pipelines/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "pipelines" / "fi.py") -> "isarnproject/pipelines/fi.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "__init__.py") -> "isarnproject/sketches/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "__init__.py") -> "isarnproject/sketches/udaf/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udaf" / "tdigest.py") -> "isarnproject/sketches/udaf/tdigest.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "__init__.py") -> "isarnproject/sketches/udt/__init__.py",
(baseDirectory.value / "python" / "isarnproject" / "sketches" / "udt" / "tdigest.py") -> "isarnproject/sketches/udt/tdigest.py"
)

test in assembly := {}
2 changes: 1 addition & 1 deletion project/build.properties
@@ -1 +1 @@
sbt.version=1.2.0
sbt.version=1.3.12
8 changes: 4 additions & 4 deletions project/plugins.sbt
@@ -7,13 +7,13 @@ resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositori

resolvers += "jgit-repo" at "http://download.eclipse.org/jgit/maven"

addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.2")
addSbtPlugin("com.typesafe.sbt" % "sbt-ghpages" % "0.6.3")

addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.1")
addSbtPlugin("com.eed3si9n" % "sbt-unidoc" % "0.4.3")

addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.1.6")
addSbtPlugin("io.crashbox" % "sbt-gpg" % "0.2.1")

addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "2.3")
addSbtPlugin("org.xerial.sbt" % "sbt-sonatype" % "3.9.2")

// scoverage and coveralls deps are at old versions to avoid a bug in the current versions
// update these when this fix is released: https://github.com/scoverage/sbt-coveralls/issues/73
