Use konserve synchronous IO. Support native-image compilation.
whilo committed Oct 20, 2022
1 parent b8e4322 commit 4f4bdf1
Showing 13 changed files with 453 additions and 75 deletions.
8 changes: 8 additions & 0 deletions bin/build-native-image
@@ -0,0 +1,8 @@
#!/usr/bin/env bash

function on_path {
builtin type -P "$1" &> /dev/null
}

GRAAL_NOT_ON_PATH="PATH does not contain native-image. Make sure to add your GraalVM to it."
on_path native-image && clojure -M:native-image --no-fallback --report-unsupported-elements-at-runtime || echo "$GRAAL_NOT_ON_PATH"
16 changes: 16 additions & 0 deletions bin/run-native-image-tests
@@ -0,0 +1,16 @@
#!/bin/bash

# Must match the :path configured in testconfig.edn.
TMPSTORE=/tmp/dh-test-store

datahike benchmark db:testconfig.edn 0 100000 10000
datahike transact db:testconfig.edn '[[:db/add -1 :name "Judea"]]'
QUERY_OUT=$(datahike query '[:find (count ?e) . :where [?e :name _]]' db:testconfig.edn)

if [ "$QUERY_OUT" -eq 100001 ]
then
echo "Test successful."
else
echo "Query did not return correct value."
fi

rm -rf $TMPSTORE
5 changes: 5 additions & 0 deletions bin/testconfig.edn
@@ -0,0 +1,5 @@
{:store {:backend :file
         :path "/tmp/dh-test-store"
         :config {:in-place? true}}
 :keep-history? true
 :schema-flexibility :read}
20 changes: 15 additions & 5 deletions deps.edn
@@ -1,9 +1,11 @@
 {:deps {org.clojure/clojure {:mvn/version "1.11.1"}
         org.clojure/clojurescript {:mvn/version "1.11.4"}
         io.replikativ/hasch {:mvn/version "0.3.7"}
-        io.replikativ/hitchhiker-tree {:mvn/version "0.1.11"}
         io.replikativ/incognito {:mvn/version "0.3.66"}
-        io.replikativ/konserve {:mvn/version "0.6.0-alpha3"}
+        io.replikativ/hitchhiker-tree {;; :mvn/version "0.1.11"
+                                       :git/url "https://github.com/replikativ/hitchhiker-tree.git"
+                                       :sha "35c8a8040d37dd98845023cff427f2e961ab745c"}
+        io.replikativ/konserve {:git/url "https://github.com/replikativ/konserve.git"
+                                :sha "5e8416fc12fe8fc05384ff17fb641538645671db"}
         persistent-sorted-set/persistent-sorted-set {:mvn/version "0.1.4"}
         environ/environ {:mvn/version "1.2.0"}
         com.taoensso/timbre {:mvn/version "5.2.1"}
@@ -71,12 +73,20 @@

 :ffix {:extra-deps {cljfmt/cljfmt {:mvn/version "0.8.0"}}
        :main-opts ["-m" "cljfmt.main" "fix"]}

 :build {:deps {io.github.clojure/tools.build {:git/tag "v0.8.1" :git/sha "7d40500"}
                slipset/deps-deploy {:mvn/version "0.2.0"}
                borkdude/gh-release-artifact {:git/url "https://github.com/borkdude/gh-release-artifact"
                                              :sha "a83ee8da47d56a80b6380cbb6b4b9274048067bd"}
                babashka/babashka.curl {:mvn/version "0.1.2"}
                babashka/fs {:mvn/version "0.1.4"}
                cheshire/cheshire {:mvn/version "5.10.2"}}
-        :ns-default build}}}
+        :ns-default build}
+ :native-image {:main-opts ["-m" "clj.native-image" "datahike.cli"
+                            "--initialize-at-build-time"
+                            ;; optional native image name override
+                            "-H:Name=datahike"]
+                :jvm-opts ["-Dclojure.compiler.direct-linking=true"]
+                :extra-deps
+                {clj.native-image/clj.native-image
+                 {:git/url "https://github.com/taylorwood/clj.native-image.git"
+                  :sha "7708e7fd4572459c81f6a6b8e44c96f41cdd92d4"}}}}}
128 changes: 128 additions & 0 deletions doc/cli.md
@@ -0,0 +1,128 @@
# Command line interface

*This is work in progress and subject to change.*

We provide the `datahike` native executable to access Datahike databases from
the command line.


# Example usage

First you need to download the precompiled binary, or build it yourself, and put
it on your executable path.

To access a database you need to provide the usual Datahike configuration. Put
the following into a file `myconfig.edn`:

```clojure
{:store {:backend :file
         :path "/home/USERNAME/dh-shared-db"
         :config {:in-place? true}}
 :keep-history? true
 :schema-flexibility :read}
```

Now you can invoke some of our core API functions on the database. Let us add
a fact to the database (be careful to use single quotes if you do not want your
shell to substitute parts of your Datalog ;) ):

```bash
$ datahike transact db:myconfig.edn '[[:db/add -1 :name "Linus"]]'
```

And retrieve it:

```bash
$ datahike query '[:find ?n . :where [?e :name ?n]]' db:myconfig.edn
"Linus" # prints the name
```

By prefixing a path with `db:` you can pass multiple db configuration files to
the query engine and join over arbitrarily many databases. Everything else is
read in as `edn` and passed to the query engine as well.
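
For example, you could join over two stores like this (a sketch assuming a
second configuration file `otherconfig.edn`; the query returns all names that
are present in both databases):

```bash
$ datahike query '[:find ?n :in $ $2 :where [$ ?e :name ?n] [$2 ?e2 :name ?n]]' db:myconfig.edn db:otherconfig.edn
```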


Provided the filestore is configured with `{:in-place? true}` you can even
write to the same database from different shells without a dedicated daemon:

```bash
$ datahike benchmark db:myconfig.edn 0 50000 100
"Elapsed time: 116335.589411 msecs"
```

Here we use the provided benchmark helper, which transacts facts of the form
`[eid :name (random-team-member)]` for `eid=0,...,50000` into the store. `100`
denotes the batch size for each transaction, so here we chunk the 50000 facts
into 500 transactions.
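
Conceptually, each batch then corresponds to a single transaction; with a batch
size of 2 the first one would resemble the following sketch (the member names
are made up for illustration):

```bash
$ datahike transact db:myconfig.edn '[[:db/add 0 :name "Judea"] [:db/add 1 :name "Konrad"]]'
```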

In a second shell you can now simultaneously add facts in a different range:

```bash
$ datahike benchmark db:myconfig.edn 50000 100000 100
```


Let us check that everything has been added and that no write operations have
overwritten each other:


```bash
$ datahike query '[:find (count ?e) . :in $ :where [?e :name ?n]]' db:myconfig.edn
100000 # check :)
```

# Memory model

The persistence semantics of Datahike work more like `git` and less like those
of mutable databases such as SQLite or Datalevin. In particular, you can always
read and retain snapshots (copies) of the database for free, no matter what
else is happening in the system. The current version is tested with the memory
and file storage backends, but hopefully many other backends will also work
with the `native-image`.

In principle this shared memory access should even work while a JVM server,
e.g. datahike-server, is serving the same database. Note that all reads can
happen in parallel; only the writers experience congestion around the exclusive
file locks. This access pattern does not provide the highest throughput, but it
is extremely flexible and easy to get started with.

## Forking and pulling

Forking is easy: it is enough to copy the folder of the store (even if the
database is currently being written to). The only thing you need to take care
of is to copy the DB root, the file `0594e3b6-9635-5c99-8142-412accf3023b.ksv`,
first, but place it into the target directory last. You can use e.g. `rsync`
(or `git`) to copy all other (immutable) files into your new folder. In the end
you copy the root file in there as well, making sure that all files it
references are reachable. Note that this scheme also ensures that you only copy
new data each time.
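
A minimal sketch of this procedure, assuming the store lives in `SRC` and the
fork should end up in `DST`:

```bash
SRC=/home/USERNAME/dh-shared-db
DST=/home/USERNAME/dh-fork
ROOT=0594e3b6-9635-5c99-8142-412accf3023b.ksv

mkdir -p "$DST"
# Read the root first, so the snapshot only references files that already exist.
cp "$SRC/$ROOT" /tmp/dh-root-snapshot.ksv
# Copy all other (immutable) files; repeated runs only transfer new data.
rsync -a --exclude "$ROOT" "$SRC/" "$DST/"
# Place the root last, once everything it references is in place.
mv /tmp/dh-root-snapshot.ksv "$DST/$ROOT"
```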

## Merging

Now here comes the cool part: you do not need anything more for merging than
Datalog itself. You can use a query like the following to extract all new facts
that are in `db1` but not in `db2`:

```bash
datahike query '[:find ?e ?a ?v ?t :in $ $2 :where [$ ?e ?a ?v ?t] (not [$2 ?e ?a ?v ?t])]' db:config1.edn db:config2.edn
```

Since we cannot update transaction metadata, we should filter out
`:db/txInstant`s. We can also use a trick to add `:db/add` to each element in
the results, yielding valid transactions that we can then feed into `db2`.


```bash
datahike query '[:find ?db-add ?e ?a ?v ?t :in $ $2 ?db-add :where [$ ?e ?a ?v ?t] [(not= :db/txInstant ?a)] (not [$2 ?e ?a ?v ?t])]' db:config1.edn db:config2.edn ":db/add" | datahike transact db:config2.edn
```

Note that this very simple strategy assumes that the entity ids added to `db1`
do not overlap with potentially new ones added to `db2`. You can encode
conflict resolution strategies and id mappings with Datalog as well, and we are
exploring several such strategies at the moment. This strategy is fairly
universal, as [CRDTs can be expressed in pure
Datalog](https://speakerdeck.com/ept/data-structures-as-queries-expressing-crdts-using-datalog).
While it is not the most efficient way to merge, we plan to provide fast paths
for common patterns in Datalog. Feel free to contact us if you are interested
in complex merging strategies or have related cool ideas.
2 changes: 2 additions & 0 deletions java/src/datahike/java/DatahikeTest.java
@@ -129,6 +129,8 @@ public void history() {
public void asOfAndSince() throws InterruptedException {
transactOnce();

// Wait briefly so the next transaction gets a strictly later timestamp.
Thread.sleep(10);

firstDate = new Date();
Datahike.transact(conn, vec(map(
k(":db/id"), vec(k(":name"), "Alice"),