Skip to content

Commit

Permalink
Add support for Java 17 (#346)
Browse files Browse the repository at this point in the history
* Update Spark to 3.3.3

* Update tests

* Update .gitignore

* Fix tests

* Update dependencies in the template

* Update dependencies in
  • Loading branch information
skylee03 committed Oct 28, 2023
1 parent b616f02 commit 6a54e54
Show file tree
Hide file tree
Showing 8 changed files with 54 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ spark-warehouses/

*.DS_Store*
.clj-kondo/.cache
.clj-kondo/marick

pom.xml
pom.xml.asc
Expand Down
21 changes: 15 additions & 6 deletions lein-template/resources/leiningen/new/geni/project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,16 @@
[metosin/jsonista "0.3.3"
:exclusions [com.fasterxml.jackson.core/jackson-databind]]
[expound "0.8.9"]
[io.netty/netty-all "4.1.74.Final"]
[com.fasterxml.jackson.core/jackson-core "2.15.3"]
[com.fasterxml.jackson.core/jackson-annotations "2.15.3"]
;; Spark
[org.apache.spark/spark-core_2.12 "3.1.2"]
[org.apache.spark/spark-hive_2.12 "3.1.2"]
[org.apache.spark/spark-mllib_2.12 "3.1.2"]
[org.apache.spark/spark-sql_2.12 "3.1.2"]
[org.apache.spark/spark-streaming_2.12 "3.1.2"]
[org.apache.spark/spark-yarn_2.12 "3.1.2"]
[org.apache.spark/spark-core_2.12 "3.3.3"]
[org.apache.spark/spark-hive_2.12 "3.3.3"]
[org.apache.spark/spark-mllib_2.12 "3.3.3"]
[org.apache.spark/spark-sql_2.12 "3.3.3"]
[org.apache.spark/spark-streaming_2.12 "3.3.3"]
[org.apache.spark/spark-yarn_2.12 "3.3.3"]
[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
; Arrow
[org.apache.arrow/arrow-memory-netty "4.0.0"]
Expand All @@ -40,6 +43,12 @@
"--class"
"{{namespace}}.core"
"target/uberjar/{{raw-name}}-standalone.jar"]]}{{/dataproc?}}
:jvm-opts ["--add-opens=java.base/java.io=ALL-UNNAMED"
"--add-opens=java.base/java.nio=ALL-UNNAMED"
"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED"
"--add-opens=java.base/java.util=ALL-UNNAMED"
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"]
:profiles {:uberjar {:aot :all}
:dev {:plugins [[lein-ancient "0.7.0"]]}}
:main ^:skip-aot {{namespace}}.core
Expand Down
35 changes: 22 additions & 13 deletions project.clj
Original file line number Diff line number Diff line change
@@ -1,29 +1,38 @@
(def spark-deps
'[;; Spark
'[[io.netty/netty-all "4.1.74.Final"]
[com.fasterxml.jackson.core/jackson-core "2.15.3"]
[com.fasterxml.jackson.core/jackson-annotations "2.15.3"]
;; Spark
; This breaks cljdoc: https://github.com/cljdoc/cljdoc/issues/407
; Frozen until issue is resolved.
;[com.github.fommil.netlib/all "1.1.2" :extension "pom"]
[org.apache.spark/spark-avro_2.12 "3.1.1"]
[org.apache.spark/spark-core_2.12 "3.1.1"]
[org.apache.spark/spark-hive_2.12 "3.1.1"]
[org.apache.spark/spark-mllib_2.12 "3.1.1"]
[org.apache.spark/spark-sql_2.12 "3.1.1"]
[org.apache.spark/spark-streaming_2.12 "3.1.1"]
[org.apache.spark/spark-avro_2.12 "3.3.3"]
[org.apache.spark/spark-core_2.12 "3.3.3"]
[org.apache.spark/spark-hive_2.12 "3.3.3"]
[org.apache.spark/spark-mllib_2.12 "3.3.3"]
[org.apache.spark/spark-sql_2.12 "3.3.3"]
[org.apache.spark/spark-streaming_2.12 "3.3.3"]
; Arrow
[org.apache.arrow/arrow-memory-netty "3.0.0"]
[org.apache.arrow/arrow-memory-core "3.0.0"]
[org.apache.arrow/arrow-vector "3.0.0"
[org.apache.arrow/arrow-memory-netty "4.0.0"]
[org.apache.arrow/arrow-memory-core "4.0.0"]
[org.apache.arrow/arrow-vector "4.0.0"
:exclusions [commons-codec com.fasterxml.jackson.core/jackson-databind]]
; Databases
[mysql/mysql-connector-java "8.0.23"]
[org.postgresql/postgresql "42.2.19"]
[mysql/mysql-connector-java "8.0.25"]
[org.postgresql/postgresql "42.2.20"]
[org.xerial/sqlite-jdbc "3.34.0"]
;; Optional: Spark XGBoost
[ml.dmlc/xgboost4j-spark_2.12 "1.2.0"]
[ml.dmlc/xgboost4j_2.12 "1.2.0"]])

(defproject zero.one/geni "0.0.40"
:jvm-opts ["-Duser.country=US" "-Duser.language=en"]
:jvm-opts ["-Duser.country=US" "-Duser.language=en"
"--add-opens=java.base/java.io=ALL-UNNAMED"
"--add-opens=java.base/java.nio=ALL-UNNAMED"
"--add-opens=java.base/java.lang.invoke=ALL-UNNAMED"
"--add-opens=java.base/java.util=ALL-UNNAMED"
"--add-opens=java.base/sun.nio.ch=ALL-UNNAMED"
"--add-opens=java.base/sun.util.calendar=ALL-UNNAMED"]
:description "A Clojure dataframe library that runs on Spark"
:url "https://github.com/zero-one-group/geni"
:license {:name "Apache License"
Expand Down
6 changes: 3 additions & 3 deletions test/zero_one/geni/data_sources_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
(g/dtypes dummy-df) => {:coord "ArrayType(DoubleType,true)"
:prop "MapType(StringType,StringType,true)"
:rooms (str "StructType("
"StructField(rooms,LongType,true), "
"StructField(rooms,LongType,true),"
"StructField(bathroom,DoubleType,true))")})
(fact "correct direct schema option"
(-> (g/read-parquet!
Expand All @@ -46,7 +46,7 @@
g/dtypes) => {:coord "ArrayType(LongType,true)"
:prop "MapType(StringType,StringType,true)"
:rooms (str "StructType("
"StructField(rooms,IntegerType,true), "
"StructField(rooms,IntegerType,true),"
"StructField(bathroom,FloatType,true))")})
(fact "correct data-oriented schema option"
(-> (g/read-parquet!
Expand All @@ -57,7 +57,7 @@
g/dtypes) => {:coord "ArrayType(ShortType,true)"
:prop "MapType(StringType,StringType,true)"
:rooms (str "StructType("
"StructField(rooms,FloatType,true), "
"StructField(rooms,FloatType,true),"
"StructField(bathroom,LongType,true))")})))

(facts "On binary data" :binary
Expand Down
10 changes: 5 additions & 5 deletions test/zero_one/geni/dataset_creation_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -51,15 +51,15 @@
[(g/row (g/row 27 42))
(g/row (g/row 57 18))]
{:coord {:x :int :y :int}}))
=> {:coord "StructType(StructField(x,IntegerType,true), StructField(y,IntegerType,true))"})
=> {:coord "StructType(StructField(x,IntegerType,true),StructField(y,IntegerType,true))"})
(fact "of struct array fields"
(g/dtypes
(g/create-dataframe
@tr/spark
[(g/row [(g/row 27 42)])
(g/row [(g/row 57 18)])]
{:coords [{:x :int :y :int}]}))
=> {:coords "ArrayType(StructType(StructField(x,IntegerType,true), StructField(y,IntegerType,true)),true)"}))
=> {:coords "ArrayType(StructType(StructField(x,IntegerType,true),StructField(y,IntegerType,true)),true)"}))

(facts "On building blocks"
(fact "can instantiate vectors"
Expand Down Expand Up @@ -266,7 +266,7 @@
(instance? Dataset dataset) => true
(g/column-names dataset) => ["a" "b"]
(g/dtypes dataset) => {:a "LongType"
:b "StructType(StructField(z,ArrayType(StringType,true),true), StructField(y,BooleanType,true))"}))
:b "StructType(StructField(z,ArrayType(StringType,true),true),StructField(y,BooleanType,true))"}))
(fact "should create the right schema for list of maps"
(let [dataset (g/table->dataset
@tr/spark
Expand All @@ -276,7 +276,7 @@
(instance? Dataset dataset) => true
(g/column-names dataset) => ["a" "b"]
(g/dtypes dataset) => {:a "LongType"
:b "ArrayType(StructType(StructField(z,LongType,true), StructField(y,DoubleType,true)),true)"}))
:b "ArrayType(StructType(StructField(z,LongType,true),StructField(y,DoubleType,true)),true)"}))
(fact "should create the right schema for list of list of maps"
(let [dataset (g/table->dataset
@tr/spark
Expand All @@ -286,7 +286,7 @@
(instance? Dataset dataset) => true
(g/column-names dataset) => ["a" "b"]
(g/dtypes dataset) => {:a "LongType"
:b "ArrayType(ArrayType(StructType(StructField(z,LongType,true), StructField(y,BooleanType,true)),true),true)"})))
:b "ArrayType(ArrayType(StructType(StructField(z,LongType,true),StructField(y,BooleanType,true)),true),true)"})))

(facts "On spark range"
(fact "should create simple datasets"
Expand Down
2 changes: 1 addition & 1 deletion test/zero_one/geni/dataset_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@
(-> (df-20)
(g/repartition :Suburb :SellerG)
g/partitions
count) => #(< 1 %))
count) => #(<= 1 %))
(fact "able to repartition by number and columns"
(-> (df-20)
(g/repartition 10 :Suburb :SellerG)
Expand Down
2 changes: 1 addition & 1 deletion test/zero_one/geni/rdd_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@
(rdd/resources) => {}
(rdd/spark-home) => (System/getenv "SPARK_HOME")
(rdd/sc) => (partial instance? SparkContext)
(rdd/version) => "3.1.1"))
(rdd/version) => "3.3.3"))

(facts "On repartitioning" :rdd
(fact "partition-by works"
Expand Down
12 changes: 6 additions & 6 deletions test/zero_one/geni/sql_functions_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
:to-2 (g/to-json (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")})
{:timestampFormat "dd/MM/yyyy"})})
g/collect
first) => {:schema-1 "ARRAY<STRUCT<`col`: BIGINT>>"
:schema-2 "ARRAY<STRUCT<`col`: BIGINT>>"
first) => {:schema-1 "ARRAY<STRUCT<col: BIGINT>>"
:schema-2 "ARRAY<STRUCT<col: BIGINT>>"
:from-1 {:a 1 :b 0.8}
:from-2 {:time (Timestamp. 1440547200000)}
:to-1 "{\"a\":1,\"b\":2}"
Expand All @@ -44,8 +44,8 @@
:to-2 (g/to-csv (g/struct {:time (g/to-timestamp (g/lit "2015-08-26") "yyyy-MM-dd")})
{:timestampFormat "dd/MM/yyyy"})})
g/collect
first) => {:schema-1 "STRUCT<`_c0`: INT, `_c1`: STRING>"
:schema-2 "STRUCT<`_c0`: INT, `_c1`: STRING>"
first) => {:schema-1 "STRUCT<_c0: INT, _c1: STRING>"
:schema-2 "STRUCT<_c0: INT, _c1: STRING>"
:from-1 {:a 1 :b 0.8}
:from-2 {:time (Timestamp. 1440547200000)}
:to-1 "1,2"
Expand Down Expand Up @@ -214,7 +214,7 @@
(-> (df-20)
(g/cube :SellerG :Regionname)
(g/agg (g/grouping-id :SellerG :Regionname))
g/first-vals) => ["Nelson" nil 1]
g/first-vals) => ["Biggin" "Northern Metropolitan" 0]
(-> (df-20)
(g/group-by :SellerG)
(g/agg (-> (g/collect-list :Regionname) (g/as :regions)))
Expand Down Expand Up @@ -503,7 +503,7 @@
(g/agg
(g/count-distinct {:seller :SellerG
:suburb :Suburb}))
g/column-names) => ["count(SellerG AS `seller`, Suburb AS `suburb`)"])))
g/column-names) => ["count(SellerG AS seller, Suburb AS suburb)"])))

(facts "On window functions" :slow
(let [window (g/window {:partition-by :SellerG :order-by :Price})]
Expand Down

0 comments on commit 6a54e54

Please sign in to comment.