Skip to content

Commit

Permalink
Populate top level docs (#262)
Browse files Browse the repository at this point in the history
* Added more top-level docs + more URLs to scrape

* Fixed missing import-fn for parallelise-pairs

* Updated docker/project.clj

* Minor tweak to project.clj formatting
  • Loading branch information
anthony-khong authored Oct 13, 2020
1 parent f30c6d3 commit 0c37070
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 31 deletions.
10 changes: 9 additions & 1 deletion docker/project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,15 @@
[midje "1.9.9"]]
:plugins [[lein-ancient "0.6.15"]
[lein-cloverage "1.2.1"]
[lein-midje "3.2.2"]]
[lein-midje "3.2.2"]
[lein-cljfmt "0.7.0"]]
:cljfmt {:split-keypairs-over-multiple-lines? false
:remove-multiple-non-indenting-spaces? false
;; Note: we add custom rules to handle code from midje test library
;; See https://github.com/weavejester/cljfmt/blob/master/cljfmt/resources/cljfmt/indents/clojure.clj
;; for more control
:indents {facts [[:inner 0] [:block 1]]
fact [[:inner 0] [:block 1]]}}
:aot [zero-one.geni.rdd.function
zero-one.geni.aot-functions]}}
:repl-options {:init-ns zero-one.geni.main}
Expand Down
3 changes: 2 additions & 1 deletion project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,8 @@
:cljfmt {:split-keypairs-over-multiple-lines? false
:remove-multiple-non-indenting-spaces? false
;; Note: we add custom rules to handle code from midje test library
;; See: https://github.com/weavejester/cljfmt/blob/master/cljfmt/resources/cljfmt/indents/clojure.clj ;; for more control
;; See https://github.com/weavejester/cljfmt/blob/master/cljfmt/resources/cljfmt/indents/clojure.clj
;; for more control
:indents {facts [[:inner 0] [:block 1]]
fact [[:inner 0] [:block 1]]}}
:aot [zero-one.geni.rdd.function
Expand Down
3 changes: 3 additions & 0 deletions scripts/scrape-spark-docs.clj
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@
:functions "sql/functions$.html"
:grouped "sql/RelationalGroupedDataset.html"
:na-fns "sql/DataFrameNaFunctions.html"
:row "sql/Row$.html"
:stat-fns "sql/DataFrameStatFunctions.html"
:window "sql/expressions/Window$.html"}
:hash-partitioner "HashPartitioner.html"
Expand All @@ -183,6 +184,8 @@
:string-indexer "ml/feature/StringIndexerModel.html"
:vector-indexer "ml/feature/VectorIndexerModel.html"
:vector-size-hint "ml/feature/VectorSizeHint.html"}
:linalg
{:vectors "ml/linalg/Vectors$.html"}
:models
{:als "ml/recommendation/ALSModel.html"
:classification "ml/classification/ClassificationModel.html"
Expand Down
9 changes: 9 additions & 0 deletions src/clojure/zero_one/geni/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,11 @@
conf
default-min-partitions
default-parallelism
get-checkpoint-dir
get-conf
get-local-property
get-persistent-rdds
get-spark-home
jars
java-spark-context
master
Expand Down Expand Up @@ -626,3 +631,7 @@

(def to-debug-string (memfn toDebugString))
(def ->debug-string to-debug-string)

(comment
(require '[zero-one.geni.docs :as docs])
(docs/docless-vars *ns*))
16 changes: 14 additions & 2 deletions src/clojure/zero_one/geni/interop.clj
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
[clojure.java.data :as j]
[clojure.string :refer [replace-first]]
[clojure.walk :as walk]
[zero-one.geni.docs :as docs]
[zero-one.geni.utils :refer [ensure-coll]])
(:import
(java.io ByteArrayOutputStream)
Expand Down Expand Up @@ -98,6 +99,7 @@

(defn ->sparse-vector
  "Constructs a `SparseVector` from `size`, coercing `indices` to an int array
  and `values` to a double array before invoking the constructor."
  [size indices values]
  (let [index-array (int-array indices)
        value-array (double-array values)]
    (SparseVector. size index-array value-array)))
(def sparse ->sparse-vector)

(defn array?
  "Returns true when `value` is a Java array and false otherwise.
  nil is treated as a non-array and yields false instead of throwing."
  [value]
  ;; `(class nil)` evaluates to nil, so guard before the interop call to avoid
  ;; a NullPointerException. The ^Class hint lets the compiler resolve
  ;; `.isArray` statically rather than via reflection.
  (if-some [^Class klass (class value)]
    (.isArray klass)
    false))

Expand Down Expand Up @@ -210,7 +212,17 @@
(let [flattened (mapcat ensure-coll values)]
(->dense-vector flattened)))

(def sparse ->sparse-vector)

;; Variadic convenience constructor: gathers every argument into a single seq
;; and hands that seq to `->spark-row`.
(defn row
  [& values]
  (-> values ->spark-row))

(docs/add-doc!
(var dense)
(-> docs/spark-docs :methods :ml :linalg :vectors :dense))

(docs/add-doc!
(var sparse)
(-> docs/spark-docs :methods :ml :linalg :vectors :sparse))

(docs/add-doc!
(var row)
(-> docs/spark-docs :methods :core :row :from-seq))
73 changes: 46 additions & 27 deletions src/clojure/zero_one/geni/spark_context.clj
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
(ns zero-one.geni.spark-context
(:require
[potemkin :refer [import-fn]]
[zero-one.geni.defaults :as defaults]
[zero-one.geni.docs :as docs]
[zero-one.geni.interop :as interop]
[zero-one.geni.rdd.unmangle :as unmangle])
(:import
(org.apache.spark.api.java JavaSparkContext)
(org.apache.spark.sql SparkSession)))

(defn java-spark-context [spark]
(defn java-spark-context
"Converts a SparkSession to a JavaSparkContext."
[spark]
(JavaSparkContext/fromSparkContext (.sparkContext spark)))

(defn app-name
Expand All @@ -27,13 +31,13 @@
([value] (broadcast @defaults/spark value))
([spark value] (-> spark java-spark-context (.broadcast value))))

(defn checkpoint-dir
([] (checkpoint-dir @defaults/spark))
(defn get-checkpoint-dir
([] (get-checkpoint-dir @defaults/spark))
([spark]
(-> spark java-spark-context .getCheckpointDir interop/optional->nillable)))

(defn conf
([] (conf @defaults/spark))
(defn get-conf
([] (get-conf @defaults/spark))
([spark] (-> spark java-spark-context .getConf interop/spark-conf->map)))

(defn default-min-partitions
Expand All @@ -52,61 +56,56 @@
([] (jars @defaults/spark))
([spark] (->> spark java-spark-context .jars (into []))))

(defn local?
([] (local? @defaults/spark))
(defn is-local
([] (is-local @defaults/spark))
([spark] (-> spark java-spark-context .isLocal)))
(def is-local local?)

(defn local-property
([k] (local-property @defaults/spark k))
(defn get-local-property
([k] (get-local-property @defaults/spark k))
([spark k] (-> spark java-spark-context (.getLocalProperty k))))

;; Zero-arity form falls back to the session held in `defaults/spark`;
;; one-arity form reads `.master` off the session's JavaSparkContext.
(defn master
  ([]
   (master @defaults/spark))
  ([spark]
   (.master (java-spark-context spark))))

;; TODO: support min-partitions arg
(defn parallelise
([data] (parallelise @defaults/spark data))
(defn parallelize
([data] (parallelize @defaults/spark data))
([spark data] (-> spark
java-spark-context
(.parallelize data)
unmangle/unmangle-name)))
(def parallelize parallelise)

(defn parallelise-doubles
([data] (parallelise-doubles @defaults/spark data))
(defn parallelize-doubles
([data] (parallelize-doubles @defaults/spark data))
([spark data]
(-> spark
java-spark-context
(.parallelizeDoubles (clojure.core/map double data))
unmangle/unmangle-name)))
(def parallelize-doubles parallelise-doubles)

(defn parallelise-pairs
([data] (parallelise-pairs @defaults/spark data))
(defn parallelize-pairs
([data] (parallelize-pairs @defaults/spark data))
([spark data]
(-> spark
java-spark-context
(.parallelizePairs (clojure.core/map interop/->scala-tuple2 data))
unmangle/unmangle-name)))
(def parallelize-pairs parallelise-pairs)

(defn persistent-rdds
([] (persistent-rdds @defaults/spark))
(defn get-persistent-rd-ds
([] (get-persistent-rd-ds @defaults/spark))
([spark] (->> spark java-spark-context .getPersistentRDDs (into {}))))

;; Zero-arity form falls back to the session held in `defaults/spark`;
;; one-arity form copies the context's `.resources` result into a Clojure map.
(defn resources
  ([]
   (resources @defaults/spark))
  ([spark]
   (into {} (.resources (java-spark-context spark)))))

(defn spark-context
([] (spark-context @defaults/spark))
(defn sc
([] (sc @defaults/spark))
([spark] (-> spark java-spark-context .sc)))
(def sc spark-context)

(defn spark-home
([] (spark-home @defaults/spark))
(defn get-spark-home
([] (get-spark-home @defaults/spark))
([spark] (-> spark java-spark-context .getSparkHome interop/optional->nillable)))

(defmulti text-file (fn [head & _] (class head)))
Expand All @@ -132,4 +131,24 @@
(.wholeTextFiles (java-spark-context spark) path min-partitions)))

;; Broadcast
(def value (memfn value))
(def value
  "Calls the zero-argument `.value` method on its argument and returns the
  result. Presumably intended for the broadcast variables produced by
  `broadcast` - confirm against callers."
  (memfn value))

;; Docs
(docs/alter-docs-in-ns!
'zero-one.geni.spark-context
[(-> docs/spark-docs :methods :spark :context)])

;; Aliases
(import-fn get-checkpoint-dir checkpoint-dir)
(import-fn get-conf conf)
(import-fn get-local-property local-property)
(import-fn get-persistent-rd-ds get-persistent-rdds)
(import-fn get-persistent-rd-ds persistent-rdds)
(import-fn get-spark-home spark-home)
(import-fn is-local local?)
(import-fn parallelize parallelise)
(import-fn parallelize-doubles parallelise-doubles)
(import-fn parallelize-pairs parallelise-pairs)
(import-fn sc spark-context)

0 comments on commit 0c37070

Please sign in to comment.