diff --git a/nix/flake/checks/configuration-test.nix b/nix/flake/checks/configuration-test.nix new file mode 100644 index 0000000000..6de5790397 --- /dev/null +++ b/nix/flake/checks/configuration-test.nix @@ -0,0 +1,85 @@ +{ config, pkgs, ... }: +let + full-text-search = { + enabled = true; + solr.url = "http://localhost:${toString config.services.solr.port}/solr/docspell"; + postgresql = { + pg-config = { + "german" = "my-germam"; + }; + }; + }; +in +{ + + i18n = { + defaultLocale = "en_US.UTF-8"; + }; + console.keyMap = "de"; + + users.users.root = { + password = "root"; + }; + + + services.docspell-joex = { + enable = true; + waitForTarget = "solr-init.target"; + bind.address = "0.0.0.0"; + base-url = "http://localhost:7878"; + jvmArgs = [ "-J-Xmx1536M" ]; + inherit full-text-search; + }; + services.docspell-restserver = { + enable = true; + bind.address = "0.0.0.0"; + backend = { + addons.enabled = true; + }; + integration-endpoint = { + enabled = true; + http-header = { + enabled = true; + header-value = "test123"; + }; + }; + openid = [ + { + enabled = true; + display = "Local"; + provider = { + provider-id = "local"; + client-id = "cid1"; + client-secret = "csecret-1"; + authorize-url = "http://auth"; + token-url = "http://token"; + sign-key = "b64:uiaeuae"; + }; + } + ]; + inherit full-text-search; + }; + + environment.systemPackages = + [ + pkgs.docspell.server + pkgs.docspell.joex + pkgs.jq + pkgs.telnet + pkgs.htop + pkgs.openjdk + ]; + + + services.xserver = { + enable = false; + }; + + networking = { + hostName = "docspelltest"; + firewall.allowedTCPPorts = [ 7880 ]; + }; + + system.stateVersion = "21.05"; + +} diff --git a/nix/flake/checks/default.nix b/nix/flake/checks/default.nix new file mode 100644 index 0000000000..35d436990f --- /dev/null +++ b/nix/flake/checks/default.nix @@ -0,0 +1,7 @@ +{ ... }: +{ + imports = [ + ./configuration-test.nix + ./solr.nix + ]; +} diff --git a/nix/flake/checks/solr.nix b/nix/flake/checks/solr.nix new file mode 100644 index 0000000000..b7d73cfbd7 --- /dev/null +++ b/nix/flake/checks/solr.nix @@ -0,0 +1,35 @@ +{ config, pkgs, ... }: + +# This module sets up solr with one core. It is a bit tedious…. If you +# know a better solution, please let me know. +{ + + services.solr = { + enable = true; + }; + # This is needed to run solr script as user solr + users.users.solr.useDefaultShell = true; + + systemd.services.solr-init = + let + solrPort = toString config.services.solr.port; + initSolr = '' + if [ ! -f ${config.services.solr.stateDir}/docspell_core ]; then + while ! echo "" | ${pkgs.telnet}/bin/telnet localhost ${solrPort} + do + echo "Waiting for SOLR become ready..." 
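+          # solr is not reachable yet; wait a moment and retry before creating the core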
+ sleep 1.5 + done + ${pkgs.su}/bin/su -s ${pkgs.bash}/bin/sh solr -c "${pkgs.solr}/bin/solr create_core -c docspell -p ${solrPort}"; + touch ${config.services.solr.stateDir}/docspell_core + fi + ''; + in + { script = initSolr; + after = [ "solr.target" ]; + wantedBy = [ "multi-user.target" ]; + requires = [ "solr.target" ]; + description = "Create a core at solr"; + }; + +} diff --git a/nix/flake/checks/testScript.py b/nix/flake/checks/testScript.py new file mode 100644 index 0000000000..ae5fa1a721 --- /dev/null +++ b/nix/flake/checks/testScript.py @@ -0,0 +1,3 @@ +with subtest("services are up"): + machine.wait_for_unit("docspell-restserver") + machine.wait_for_unit("docspell-joex") diff --git a/nix/flake/flake.lock b/nix/flake/flake.lock new file mode 100644 index 0000000000..c32bbe2eaf --- /dev/null +++ b/nix/flake/flake.lock @@ -0,0 +1,25 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1669535121, + "narHash": "sha256-koZLM7oWVGrjyHnYDo7/w5qlmUn9UZUKSFNfmIjueE8=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "b45ec953794bb07922f0468152ad1ebaf8a084b3", + "type": "github" + }, + "original": { + "id": "nixpkgs", + "type": "indirect" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/nix/flake/flake.nix b/nix/flake/flake.nix new file mode 100644 index 0000000000..c9ae9b0248 --- /dev/null +++ b/nix/flake/flake.nix @@ -0,0 +1,127 @@ +{ + description = "Docspell flake"; + + outputs = { self, nixpkgs }: + let + supportedSystems = [ "x86_64-linux" "aarch64-linux" ]; + forAllSystems = nixpkgs.lib.genAttrs supportedSystems; + nixpkgsFor = forAllSystems (system: import nixpkgs { inherit system; }); + # Version config + cfg = { + v0_39_0 = rec { + version = "0.39.0"; + server = { + url = "https://github.com/eikek/docspell/releases/download/v${version}/docspell-restserver-${version}.zip"; + sha256 = "sha256-YZzYOqJzp2J5ioTT8H7qpRA3mHDRjJYNA7fUOEQWSfY="; + }; + joex = { + url = "https://github.com/eikek/docspell/releases/download/v${version}/docspell-joex-${version}.zip"; + sha256 = "sha256-6Vcuk9+JDkNAdTQd+sRLARfE+y9cbtGE8hWTTcxZk3E="; + }; + }; + }; + current_version = cfg.v0_39_0; + inherit (current_version) version; + in + rec + { + overlays.default = final: prev: { + docspell-server = with final; stdenv.mkDerivation { + inherit version; + pname = "docspell-server"; + + src = fetchzip current_version.server; + buildInputs = [ jdk11 ]; + buildPhase = "true"; + + installPhase = '' + mkdir -p $out/{bin,docspell-restserver-${version}} + cp -R * $out/docspell-restserver-${version}/ + cat > $out/bin/docspell-restserver <<-EOF + #!${bash}/bin/bash + $out/docspell-restserver-${version}/bin/docspell-restserver -java-home ${jdk11} "\$@" + EOF + chmod 755 $out/bin/docspell-restserver + ''; + }; + docspell-joex = with final; stdenv.mkDerivation rec { + inherit version; + + pname = "docspell-joex"; + + src = fetchzip current_version.joex; + + buildInputs = [ jdk11 ]; + + buildPhase = "true"; + + installPhase = '' + mkdir -p $out/{bin,docspell-joex-${version}} + cp -R * $out/docspell-joex-${version}/ + cat > $out/bin/docspell-joex <<-EOF + #!${bash}/bin/bash + $out/docspell-joex-${version}/bin/docspell-joex -java-home ${jdk11} "\$@" + EOF + chmod 755 $out/bin/docspell-joex + ''; + }; + + }; + + packages = forAllSystems (system: + { + default = (import nixpkgs { + inherit system; + overlays = [ self.overlays.default ]; + }).docspell-server; + }); + + checks = forAllSystems + (system: { + build = 
self.packages.${system}.default; + + test = + with import (nixpkgs + "/nixos/lib/testing-python.nix") + { + inherit system; + }; + + makeTest { + name = "docspell"; + nodes = { + client = { ... }: { + imports = [ + self.nixosModules.default + ./checks + ]; + }; + }; + + testScript = + '' + start_all() + client.wait_for_unit("docspell-restserver") + ''; + }; + }); + + nixosModules = { + default = { ... }: { + imports = [ + ((import ./modules/server.nix) self.overlays.default) + ((import ./modules/joex.nix) self.overlays.default) + ]; + }; + server = ((import ./modules/server.nix) self.overlays.default); + joex = ((import ./modules/joex.nix) self.overlays.default); + }; + + + # nixosModules = { + # # Default module imports both server and joex + # default = import ./modules; + # server = import ./modules/server.nix; + # joex = import ./modules/joex.nix; + # }; + }; +} diff --git a/nix/flake/modules/joex.nix b/nix/flake/modules/joex.nix new file mode 100644 index 0000000000..d1ff7824d6 --- /dev/null +++ b/nix/flake/modules/joex.nix @@ -0,0 +1,1728 @@ +overlay: { config, lib, pkgs, ... }: + +with lib; +let + cfg = config.services.docspell-joex; + user = if cfg.runAs == null then "docspell" else cfg.runAs; + configFile = pkgs.writeText "docspell-joex.conf" '' + {"docspell": { "joex": + ${builtins.toJSON cfg} + }} + ''; + defaults = { + app-id = "joex1"; + base-url = "http://localhost:7878"; + bind = { + address = "localhost"; + port = 7878; + }; + logging = { + minimum-level = "Info"; + format = "Fancy"; + levels = { + "docspell" = "Info"; + "org.flywaydb" = "Info"; + "binny" = "Info"; + "org.http4s" = "Info"; + }; + }; + mail-debug = false; + jdbc = { + url = "jdbc:h2:///tmp/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"; + user = "sa"; + password = ""; + }; + send-mail = { + list-id = ""; + }; + user-tasks = { + scan-mailbox = { + max-folders = 50; + mail-chunk-size = 50; + max-mails = 500; + }; + }; + scheduler = { + pool-size = 2; + counting-scheme = "4,1"; + retries = 2; + retry-delay = "1 minute"; + log-buffer-size = 500; + wakeup-period = "30 minutes"; + }; + periodic-scheduler = { + wakeup-period = "10 minutes"; + }; + house-keeping = { + schedule = "Sun *-*-* 00:00:00 UTC"; + cleanup-invites = { + enabled = true; + older-than = "30 days"; + }; + cleanup-jobs = { + enabled = true; + older-than = "30 days"; + delete-batch = 100; + }; + cleanup-remember-me = { + enabled = true; + older-than = "30 days"; + }; + cleanup-downloads = { + enabled = true; + older-than = "14 days"; + }; + check-nodes = { + enabled = true; + min-not-found = 2; + }; + }; + update-check = { + enabled = false; + test-run = false; + schedule = "Sun *-*-* 00:00:00 UTC"; + sender-account = ""; + smtp-id = ""; + recipients = [ ]; + subject = "Docspell {{ latestVersion }} is available"; + body = '' + Hello, + + You are currently running Docspell {{ currentVersion }}. Version *{{ latestVersion }}* + is now available, which was released on {{ releasedAt }}. Check the release page at: + + + + Have a nice day! 
+ + Docpell Update Check + ''; + }; + extraction = { + pdf = { + min-text-len = 500; + }; + preview = { + dpi = 32; + }; + ocr = { + max-image-size = 14000000; + page-range = { + begin = 10; + }; + ghostscript = { + working-dir = "/tmp/docspell-extraction"; + command = { + program = "${pkgs.ghostscript}/bin/gs"; + args = [ "-dNOPAUSE" "-dBATCH" "-dSAFER" "-sDEVICE=tiffscaled8" "-sOutputFile={{outfile}}" "{{infile}}" ]; + timeout = "5 minutes"; + }; + }; + unpaper = { + command = { + program = "${pkgs.unpaper}/bin/unpaper"; + args = [ "{{infile}}" "{{outfile}}" ]; + timeout = "5 minutes"; + }; + }; + tesseract = { + command = { + program = "${pkgs.tesseract4}/bin/tesseract"; + args = [ "{{file}}" "stdout" "-l" "{{lang}}" ]; + timeout = "5 minutes"; + }; + }; + }; + }; + text-analysis = { + max-length = 5000; + nlp = { + mode = "full"; + clear-interval = "15 minutes"; + max-due-date-years = 10; + regex-ner = { + max-entries = 1000; + file-cache-time = "1 minute"; + }; + }; + classification = { + enabled = true; + item-count = 600; + classifiers = [ + { + "useSplitWords" = "true"; + "splitWordsTokenizerRegexp" = ''[\p{L}][\p{L}0-9]*|(?:\$ ?)?[0-9]+(?:\.[0-9]{2})?%?|\s+|.''; + "splitWordsIgnoreRegexp" = ''\s+''; + "useSplitPrefixSuffixNGrams" = "true"; + "maxNGramLeng" = "4"; + "minNGramLeng" = "1"; + "splitWordShape" = "chris4"; + "intern" = "true"; + } + ]; + }; + working-dir = "/tmp/docspell-analysis"; + }; + convert = { + chunk-size = 524288; + converted-filename-part = "converted"; + max-image-size = 14000000; + + markdown = { + internal-css = '' + body { padding: 2em 5em; } + ''; + }; + + wkhtmlpdf = { + command = { + program = "${pkgs.wkhtmltopdf}/bin/wkhtmltopdf"; + args = [ "-s" "A4" "--encoding" "UTF-8" "-" "{{outfile}}" ]; + timeout = "2 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; + + tesseract = { + command = { + program = "${pkgs.tesseract4}/bin/tesseract"; + args = [ "{{infile}}" "out" "-l" "{{lang}}" "pdf" "txt" ]; + timeout = "5 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; + + unoconv = { + command = { + program = "${pkgs.unoconv}/bin/unoconv"; + args = [ "-f" "pdf" "-o" "{{outfile}}" "{{infile}}" ]; + timeout = "2 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; + + ocrmypdf = { + enabled = true; + command = { + program = "${pkgs.ocrmypdf}/bin/ocrmypdf"; + args = [ + "-l" + "{{lang}}" + "--skip-text" + "--deskew" + "-j" + "1" + "{{infile}}" + "{{outfile}}" + ]; + timeout = "5 minutes"; + }; + working-dir = "/tmp/docspell-convert"; + }; + }; + files = { + chunk-size = 524288; + valid-mime-types = [ ]; + }; + full-text-search = { + enabled = false; + backend = "solr"; + solr = { + url = "http://localhost:8983/solr/docspell"; + commit-within = 1000; + log-verbose = false; + def-type = "lucene"; + q-op = "OR"; + }; + postgresql = { + use-default-connection = false; + jdbc = { + url = "jdbc:postgresql://server:5432/db"; + user = "pguser"; + password = ""; + }; + pg-config = { }; + pg-query-parser = "websearch_to_tsquery"; + pg-rank-normalization = [ 4 ]; + }; + migration = { + index-all-chunk = 10; + }; + }; + addons = { + working-dir = "/tmp/docspell-addons-work"; + cache-dir = "/tmp/docspell-addons-cache"; + executor-config = { + runner = "nix-flake,docker,trivial"; + nspawn = { + enabled = false; + sudo-binary = "sudo"; + nspawn-binary = "systemd-nspawn"; + container-wait = "100 millis"; + }; + fail-fast = true; + run-timeout = "15 minutes"; + nix-runner = { + nix-binary = "${pkgs.nixFlakes}/bin/nix"; + build-timeout = "15 minutes"; + 
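+            # pkgs.nixFlakes provides a flakes-enabled nix binary, which the
+            # nix-flake addon runner configured above needs.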
}; + docker-runner = { + docker-binary = "${pkgs.docker}/bin/docker"; + build-timeout = "15 minutes"; + }; + }; + }; + }; +in +{ + + ## interface + options = { + services.docspell-joex = { + enable = mkOption { + type = types.bool; + default = false; + description = "Whether to enable docspell docspell job executor."; + }; + runAs = mkOption { + type = types.nullOr types.str; + default = null; + description = '' + Specify a user for running the application. If null, a new + user is created. + ''; + }; + waitForTarget = mkOption { + type = types.nullOr types.str; + default = null; + description = '' + If not null, joex depends on this systemd target. This is + useful if full-text-search is enabled and the solr instance + is running on the same machine. + ''; + }; + jvmArgs = mkOption { + type = types.listOf types.str; + default = [ ]; + example = [ "-J-Xmx1G" ]; + description = "The options passed to the executable for setting jvm arguments."; + }; + + + app-id = mkOption { + type = types.str; + default = defaults.app-id; + description = "The node id. Must be unique across all docspell nodes."; + }; + + base-url = mkOption { + type = types.str; + default = defaults.base-url; + description = "The base url where attentive is deployed."; + }; + + bind = mkOption { + type = types.submodule ({ + options = { + address = mkOption { + type = types.str; + default = defaults.bind.address; + description = "The address to bind the REST server to."; + }; + port = mkOption { + type = types.int; + default = defaults.bind.port; + description = "The port to bind the REST server"; + }; + }; + }); + default = defaults.bind; + description = "Address and port bind the rest server."; + }; + + logging = mkOption { + type = types.submodule ({ + options = { + minimum-level = mkOption { + type = types.str; + default = defaults.logging.minimum-level; + description = "The minimum level for logging to control verbosity."; + }; + format = mkOption { + type = types.str; + default = defaults.logging.format; + description = "The log format. One of: Fancy, Plain, Json or Logfmt"; + }; + levels = mkOption { + type = types.attrs; + default = defaults.logging.levels; + description = "Set of logger and their levels"; + }; + }; + }); + default = defaults.logging; + description = "Settings for logging"; + }; + + mail-debug = mkOption { + type = types.bool; + default = defaults.mail-debug; + description = '' + Enable or disable debugging for e-mail related functionality. This + applies to both sending and receiving mails. For security reasons + logging is not very extensive on authentication failures. Setting + this to true, results in a lot of data printed to stdout. + ''; + }; + + jdbc = mkOption { + type = types.submodule ({ + options = { + url = mkOption { + type = types.str; + default = defaults.jdbc.url; + description = '' + The URL to the database. By default a file-based database is + used. It should also work with mariadb and postgresql. 
+ + Examples: + "jdbc:mariadb://192.168.1.172:3306/docspell" + "jdbc:postgresql://localhost:5432/docspell" + "jdbc:h2:///home/dbs/docspell.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE" + + ''; + }; + user = mkOption { + type = types.str; + default = defaults.jdbc.user; + description = "The user name to connect to the database."; + }; + password = mkOption { + type = types.str; + default = defaults.jdbc.password; + description = "The password to connect to the database."; + }; + }; + }); + default = defaults.jdbc; + description = "Database connection settings"; + }; + + send-mail = mkOption { + type = types.submodule ({ + options = { + list-id = mkOption { + type = types.str; + default = defaults.send-mail.list-id; + description = '' + This is used as the List-Id e-mail header when mails are sent + from docspell to its users (example: for notification mails). It + is not used when sending to external recipients. If it is empty, + no such header is added. Using this header is often useful when + filtering mails. + + It should be a string in angle brackets. See + https://tools.ietf.org/html/rfc2919 for a formal specification + ''; + }; + + }; + }); + default = defaults.send-mail; + description = "Settings for sending mails."; + }; + + scheduler = mkOption { + type = types.submodule ({ + options = { + pool-size = mkOption { + type = types.int; + default = defaults.scheduler.pool-size; + description = "Number of processing allowed in parallel."; + }; + counting-scheme = mkOption { + type = types.str; + default = defaults.scheduler.counting-scheme; + description = '' + A counting scheme determines the ratio of how high- and low-prio + jobs are run. For example: 4,1 means run 4 high prio jobs, then + 1 low prio and then start over. + ''; + }; + retries = mkOption { + type = types.int; + default = defaults.scheduler.retries; + description = '' + How often a failed job should be retried until it enters failed + state. If a job fails, it becomes "stuck" and will be retried + after a delay. + ''; + }; + retry-delay = mkOption { + type = types.str; + default = defaults.scheduler.retry-delay; + description = '' + The delay until the next try is performed for a failed job. This + delay is increased exponentially with the number of retries. + ''; + }; + log-buffer-size = mkOption { + type = types.int; + default = defaults.scheduler.log-buffer-size; + description = '' + The queue size of log statements from a job. + ''; + }; + wakeup-period = mkOption { + type = types.str; + default = defaults.scheduler.wakeup-period; + description = '' + If no job is left in the queue, the scheduler will wait until a + notify is requested (using the REST interface). To also retry + stuck jobs, it will notify itself periodically. + ''; + }; + }; + }); + default = defaults.scheduler; + description = "Settings for the scheduler"; + }; + + periodic-scheduler = mkOption { + type = types.submodule ({ + options = { + wakeup-period = mkOption { + type = types.str; + default = defaults.periodic-scheduler.wakeup-period; + description = '' + A fallback to start looking for due periodic tasks regularily. + Usually joex instances should be notified via REST calls if + external processes change tasks. But these requests may get + lost. + ''; + }; + }; + }); + default = defaults.periodic-scheduler; + description = '' + Settings for the periodic scheduler. 
+          '';
+      };
+
+      user-tasks = mkOption {
+        type = types.submodule ({
+          options = {
+            scan-mailbox = mkOption {
+              type = types.submodule ({
+                options = {
+                  max-folders = mkOption {
+                    type = types.int;
+                    default = defaults.user-tasks.scan-mailbox.max-folders;
+                    description = ''
+                      A limit of how many folders to scan through. If a user
+                      configures more than this, only up to this limit of
+                      folders are scanned and a warning is logged.
+                    '';
+                  };
+                  mail-chunk-size = mkOption {
+                    type = types.int;
+                    default = defaults.user-tasks.scan-mailbox.mail-chunk-size;
+                    description = ''
+                      How many mails (headers only) to retrieve in one chunk.
+
+                      If this is greater than `max-mails' it is set automatically to
+                      the value of `max-mails'.
+                    '';
+                  };
+                  max-mails = mkOption {
+                    type = types.int;
+                    default = defaults.user-tasks.scan-mailbox.max-mails;
+                    description = ''
+                      A limit on how many mails to process in one job run. This is
+                      meant to avoid too heavy resource allocation to one
+                      user/collective.
+
+                      If more than this number of mails is encountered, a warning is
+                      logged.
+                    '';
+                  };
+                };
+              });
+              default = defaults.user-tasks.scan-mailbox;
+              description = "Allows importing e-mails by scanning a mailbox.";
+            };
+          };
+        });
+        default = defaults.user-tasks;
+        description = "Configuration for the user tasks.";
+      };
+
+      house-keeping = mkOption {
+        type = types.submodule ({
+          options = {
+            schedule = mkOption {
+              type = types.str;
+              default = defaults.house-keeping.schedule;
+              description = ''
+                When the house keeping tasks execute. Default is to run every
+                week.
+              '';
+            };
+            cleanup-invites = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.house-keeping.cleanup-invites.enabled;
+                    description = "Whether this task is enabled.";
+                  };
+                  older-than = mkOption {
+                    type = types.str;
+                    default = defaults.house-keeping.cleanup-invites.older-than;
+                    description = "The minimum age of invites to be deleted.";
+                  };
+                };
+              });
+              default = defaults.house-keeping.cleanup-invites;
+              description = ''
+                This task removes invitation keys that have been created but not
+                used. The timespan here must be greater than the `invite-time'
+                setting in the rest server config file.
+              '';
+            };
+            cleanup-jobs = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.house-keeping.cleanup-jobs.enabled;
+                    description = "Whether this task is enabled.";
+                  };
+                  older-than = mkOption {
+                    type = types.str;
+                    default = defaults.house-keeping.cleanup-jobs.older-than;
+                    description = ''
+                      The minimum age of jobs to delete. It is matched against the
+                      `finished' timestamp.
+                    '';
+                  };
+                  delete-batch = mkOption {
+                    type = types.int;
+                    default = defaults.house-keeping.cleanup-jobs.delete-batch;
+                    description = ''
+                      This defines how many jobs are deleted in one transaction.
+                      Since the data to delete may get large, it can be configured
+                      whether more or less memory should be used.
+                    '';
+                  };
+                };
+              });
+              default = defaults.house-keeping.cleanup-jobs;
+              description = ''
+                Jobs store their log output in the database. Normally this data
+                is only interesting for some period of time. The processing logs
+                of old files can be removed eventually.
+              '';
+            };
+            cleanup-remember-me = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.house-keeping.cleanup-remember-me.enabled;
+                    description = "Whether this task is enabled.";
+                  };
+                  older-than = mkOption {
+                    type = types.str;
+                    default = defaults.house-keeping.cleanup-remember-me.older-than;
+                    description = "The minimum age of remember-me tokens to delete.";
+                  };
+                };
+              });
+              default = defaults.house-keeping.cleanup-remember-me;
+              description = "Settings for cleaning up remember-me tokens.";
+            };
+
+            cleanup-downloads = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.house-keeping.cleanup-downloads.enabled;
+                    description = "Whether this task is enabled.";
+                  };
+                  older-than = mkOption {
+                    type = types.str;
+                    default = defaults.house-keeping.cleanup-downloads.older-than;
+                    description = "The minimum age of a download file to delete.";
+                  };
+                };
+              });
+              default = defaults.house-keeping.cleanup-downloads;
+              description = "Settings for cleaning up prepared download files.";
+            };
+
+            check-nodes = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.house-keeping.check-nodes.enabled;
+                    description = "Whether this task is enabled.";
+                  };
+                  min-not-found = mkOption {
+                    type = types.int;
+                    default = defaults.house-keeping.check-nodes.min-not-found;
+                    description = "How often the node must be unreachable before it is removed.";
+                  };
+                };
+              });
+              default = defaults.house-keeping.check-nodes;
+              description = "Removes node entries that are not reachable anymore.";
+            };
+          };
+        });
+        default = defaults.house-keeping;
+        description = ''
+          Docspell uses periodic house keeping tasks, like cleaning expired
+          invites, that can be configured here.
+        '';
+      };
+
+      update-check = mkOption {
+        type = types.submodule ({
+          options = {
+            enabled = mkOption {
+              type = types.bool;
+              default = defaults.update-check.enabled;
+              description = "Whether this task is enabled.";
+            };
+            test-run = mkOption {
+              type = types.bool;
+              default = defaults.update-check.test-run;
+              description = ''
+                Sends the mail without checking the latest release. Can be used
+                if you want to see if mail sending works, but don't want to wait
+                until a new release is published.
+              '';
+            };
+            schedule = mkOption {
+              type = types.str;
+              default = defaults.update-check.schedule;
+              description = ''
+                When the check-update task should execute. Default is to run
+                every week.
+              '';
+            };
+            sender-account = mkOption {
+              type = types.str;
+              default = defaults.update-check.sender-account;
+              description = ''
+                An account id in form of `collective/user` (or just `user` if
+                collective and user name are the same). This user account must
+                have at least one valid SMTP setting which is used to send the
+                mail.
+              '';
+            };
+            smtp-id = mkOption {
+              type = types.str;
+              default = defaults.update-check.smtp-id;
+              description = ''
+                The SMTP connection id that should be used for sending the mail.
+              '';
+            };
+            recipients = mkOption {
+              type = types.listOf types.str;
+              default = defaults.update-check.recipients;
+              example = [ "josh.doe@gmail.com" ];
+              description = ''
+                A list of recipient e-mail addresses.
+              '';
+            };
+            subject = mkOption {
+              type = types.str;
+              default = defaults.update-check.subject;
+              description = ''
+                The subject of the mail. It supports the same variables as the body.
+ ''; + }; + body = mkOption { + type = types.str; + default = defaults.update-check.body; + description = '' + The body of the mail. Subject and body can contain these + variables which are replaced: + + - `latestVersion` the latest available version of Docspell + - `currentVersion` the currently running (old) version of Docspell + - `releasedAt` a date when the release was published + + The body is processed as markdown after the variables have been + replaced. + ''; + }; + }; + }); + default = defaults.update-check; + description = '' + A periodic task to check for new releases of docspell. It can + inform about a new release via e-mail. You need to specify an + account that has SMTP settings to use for sending. + ''; + }; + + extraction = mkOption { + type = types.submodule ({ + options = { + pdf = mkOption { + type = types.submodule ({ + options = { + min-text-len = mkOption { + type = types.int; + default = defaults.extraction.pdf.min-text-len; + description = '' + For PDF files it is first tried to read the text parts of the + PDF. But PDFs can be complex documents and they may contain text + and images. If the returned text is shorter than the value + below, OCR is run afterwards. Then both extracted texts are + compared and the longer will be used. + ''; + }; + }; + }); + default = defaults.extraction.pdf; + description = "Settings for PDF extraction"; + }; + preview = mkOption { + type = types.submodule ({ + options = { + dpi = mkOption { + type = types.int; + default = defaults.extraction.preview.dpi; + description = '' + When rendering a pdf page, use this dpi. This results in + scaling the image. A standard A4 page rendered at 96dpi + results in roughly 790x1100px image. Using 32 results in + roughly 200x300px image. + + Note, when this is changed, you might want to re-generate + preview images. Check the api for this, there is an endpoint + to regenerate all for a collective. + ''; + }; + }; + }); + default = defaults.extraction.preview; + description = ""; + }; + ocr = mkOption { + type = types.submodule ({ + options = { + max-image-size = mkOption { + type = types.int; + default = defaults.extraction.ocr.max-image-size; + description = '' + Images greater than this size are skipped. Note that every + image is loaded completely into memory for doing OCR. + ''; + }; + page-range = mkOption { + type = types.submodule ({ + options = { + begin = mkOption { + type = types.int; + default = defaults.extraction.page-range.begin; + description = "Specifies the first N pages of a file to process."; + }; + }; + }); + default = defaults.extraction.page-range; + description = '' + Defines what pages to process. If a PDF with 600 pages is + submitted, it is probably not necessary to scan through all of + them. This would take a long time and occupy resources for no + value. The first few pages should suffice. The default is first + 10 pages. + + If you want all pages being processed, set this number to -1. + + Note: if you change the ghostscript command below, be aware that + this setting (if not -1) will add another parameter to the + beginning of the command. 
+ ''; + }; + ghostscript = mkOption { + type = types.submodule ({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.extraction.ghostscript.working-dir; + description = "Directory where the extraction processes can put their temp files"; + }; + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.ghostscript.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.ghostscript.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.ghostscript.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.ghostscript.command; + description = "The system command"; + }; + }; + }); + default = defaults.extraction.ghostscript; + description = "The ghostscript command."; + }; + unpaper = mkOption { + type = types.submodule ({ + options = { + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.unpaper.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.unpaper.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.unpaper.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.unpaper.command; + description = "The system command"; + }; + }; + }); + default = defaults.extraction.unpaper; + description = "The unpaper command."; + }; + tesseract = mkOption { + type = types.submodule ({ + options = { + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.extraction.tesseract.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.extraction.tesseract.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.extraction.tesseract.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.extraction.tesseract.command; + description = "The system command"; + }; + }; + }); + default = defaults.extraction.tesseract; + description = "The tesseract command."; + }; + + }; + }); + default = defaults.extraction.ocr; + description = ""; + }; + }; + }); + default = defaults.extraction; + description = '' + Configuration of text extraction + + Extracting text currently only work for image and pdf files. It + will first runs ghostscript to create a gray image from a + pdf. Then unpaper is run to optimize the image for the upcoming + ocr, which will be done by tesseract. All these programs must be + available in your PATH or the absolute path can be specified + below. + ''; + }; + + text-analysis = mkOption { + type = types.submodule ({ + options = { + max-length = mkOption { + type = types.int; + default = defaults.text-analysis.max-length; + description = '' + Maximum length of text to be analysed. + + All text to analyse must fit into RAM. A large document may take + too much heap. 
Also, most important information is at the + beginning of a document, so in most cases the first two pages + should suffice. Default is 10000, which are about 2-3 pages + (a rough guess). + ''; + }; + working-dir = mkOption { + type = types.str; + default = defaults.text-analysis.working-dir; + description = '' + A working directory for the analyser to store temporary/working + files. + ''; + }; + + nlp = mkOption { + type = types.submodule ({ + options = { + mode = mkOption { + type = types.str; + default = defaults.text-analysis.nlp.mode; + description = '' + The mode for configuring NLP models: + + 1. full – builds the complete pipeline + 2. basic - builds only the ner annotator + 3. regexonly - matches each entry in your address book via regexps + 4. disabled - doesn't use any stanford-nlp feature + + The full and basic variants rely on pre-build language models + that are available for only 3 lanugages at the moment: German, + English, French and Spanish. + + Memory usage varies greatly among the languages. German has + quite large models, that require about 1G heap. So joex should + run with -Xmx1400M at least when using mode=full. + + The basic variant does a quite good job for German and + English. It might be worse for French, always depending on the + type of text that is analysed. Joex should run with about 600M + heap, here again lanugage German uses the most. + + The regexonly variant doesn't depend on a language. It roughly + works by converting all entries in your addressbook into + regexps and matches each one against the text. This can get + memory intensive, too, when the addressbook grows large. This + is included in the full and basic by default, but can be used + independently by setting mode=regexner. + + When mode=disabled, then the whole nlp pipeline is disabled, + and you won't get any suggestions. Only what the classifier + returns (if enabled). + ''; + }; + + max-due-date-years = mkOption { + type = types.int; + default = defaults.processing.max-due-date-years; + description = '' + Restricts proposalsfor due dates. Only dates earlier than this + number of years in the future are considered. + ''; + }; + + clear-interval = mkOption { + type = types.str; + default = defaults.text-analysis.nlp.clear-interval; + description = '' + Idle time after which the NLP caches are cleared to free + memory. If <= 0 clearing the cache is disabled. + ''; + }; + + regex-ner = mkOption { + type = types.submodule ({ + options = { + max-entries = mkOption { + type = types.int; + default = defaults.text-analysis.regex-ner.max-entries; + description = '' + Whether to enable custom NER annotation. This uses the + address book of a collective as input for NER tagging (to + automatically find correspondent and concerned entities). If + the address book is large, this can be quite memory + intensive and also makes text analysis much slower. But it + improves accuracy and can be used independent of the + lanugage. If this is set to 0, it is effectively disabled + and NER tagging uses only statistical models (that also work + quite well, but are restricted to the languages mentioned + above). + + Note, this is only relevant if nlp-config.mode is not + "disabled". + ''; + }; + file-cache-time = mkOption { + type = types.str; + default = defaults.text-analysis.ner-file-cache-time; + description = '' + The NER annotation uses a file of patterns that is derived from + a collective's address book. This is is the time how long this + file will be kept until a check for a state change is done. 
+ ''; + }; + }; + }); + default = defaults.text-analysis.nlp.regex-ner; + description = ""; + }; + }; + }); + default = defaults.text-analysis.nlp; + description = "Configure NLP"; + }; + + classification = mkOption { + type = types.submodule ({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.text-analysis.classification.enabled; + description = '' + Whether to enable classification globally. Each collective can + decide to disable it. If it is disabled here, no collective + can use classification. + ''; + }; + item-count = mkOption { + type = types.int; + default = defaults.text-analysis.classification.item-count; + description = '' + If concerned with memory consumption, this restricts the + number of items to consider. More are better for training. A + negative value or zero means no train on all items. + ''; + }; + classifiers = mkOption { + type = types.listOf types.attrs; + default = defaults.text-analysis.classification.classifiers; + description = '' + These settings are used to configure the classifier. If + multiple are given, they are all tried and the "best" is + chosen at the end. See + https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/classify/ColumnDataClassifier.html + for more info about these settings. The settings here yielded + good results with *my* dataset. + ''; + }; + + }; + }); + default = defaults.text-analysis.classification; + description = '' + Settings for doing document classification. + + This works by learning from existing documents. A collective can + specify a tag category and the system will try to predict a tag + from this category for new incoming documents. + + This requires a satstical model that is computed from all + existing documents. This process is run periodically as + configured by the collective. It may require a lot of memory, + depending on the amount of data. + + It utilises this NLP library: https://nlp.stanford.edu/. + ''; + }; + }; + }); + default = defaults.text-analysis; + description = "Settings for text analysis"; + }; + + convert = mkOption { + type = types.submodule ({ + options = { + chunk-size = mkOption { + type = types.int; + default = defaults.convert.chunk-size; + description = '' + The chunk size used when storing files. This should be the same + as used with the rest server. + ''; + }; + converted-filename-part = mkOption { + type = types.str; + default = defaults.convert.converted-filename-part; + description = '' + A string used to change the filename of the converted pdf file. + If empty, the original file name is used for the pdf file ( the + extension is always replaced with `pdf`). + ''; + }; + + max-image-size = mkOption { + type = types.int; + default = defaults.convert.max-image-size; + description = '' + When reading images, this is the maximum size. Images that are + larger are not processed. + ''; + }; + markdown = mkOption { + type = types.submodule ({ + options = { + internal-css = mkOption { + type = types.str; + default = defaults.convert.markdown.internal-css; + description = '' + The CSS that is used to style the resulting HTML. + ''; + }; + }; + }); + default = defaults.convert.markdown; + description = '' + Settings when processing markdown files (and other text files) + to HTML. + + In order to support text formats, text files are first converted + to HTML using a markdown processor. The resulting HTML is then + converted to a PDF file. 
+ ''; + }; + wkhtmlpdf = mkOption { + type = types.submodule ({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.wktmlpdf.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.wkhtmlpdf.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.wkhtmlpdf.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.wkhtmlpdf.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.wkhtmlpdf; + description = '' + To convert HTML files into PDF files, the external tool + wkhtmltopdf is used. + ''; + }; + tesseract = mkOption { + type = types.submodule ({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.tesseract.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.tesseract.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.tesseract.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.tesseract.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.tesseract.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.tesseract; + description = '' + To convert image files to PDF files, tesseract is used. This + also extracts the text in one go. + ''; + }; + unoconv = mkOption { + type = types.submodule ({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.convert.unoconv.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.unoconv.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.unoconv.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.unoconv.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.unoconv.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.unoconv; + description = '' + To convert "office" files to PDF files, the external tool + unoconv is used. Unoconv uses libreoffice/openoffice for + converting. So it supports all formats that are possible to read + with libreoffice/openoffic. + + Note: to greatly improve performance, it is recommended to start + a libreoffice listener by running `unoconv -l` in a separate + process. 
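+            The implementation section of this module already sets up such a
+            listener as a dedicated `unoconv` systemd service.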
+ ''; + }; + + ocrmypdf = mkOption { + type = types.submodule ({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.convert.ocrmypdf.enabled; + description = "Whether to use ocrmypdf to convert pdf to pdf/a."; + }; + working-dir = mkOption { + type = types.str; + default = defaults.convert.ocrmypdf.working-dir; + description = "Directory where the conversion processes can put their temp files"; + }; + command = mkOption { + type = types.submodule ({ + options = { + program = mkOption { + type = types.str; + default = defaults.convert.ocrmypdf.command.program; + description = "The path to the executable."; + }; + args = mkOption { + type = types.listOf types.str; + default = defaults.convert.ocrmypdf.command.args; + description = "The arguments to the program"; + }; + timeout = mkOption { + type = types.str; + default = defaults.convert.ocrmypdf.command.timeout; + description = "The timeout when executing the command"; + }; + }; + }); + default = defaults.convert.ocrmypdf.command; + description = "The system command"; + }; + }; + }); + default = defaults.convert.orcmypdf; + description = '' + The tool ocrmypdf can be used to convert pdf files to pdf files + in order to add extracted text as a separate layer. This makes + image-only pdfs searchable and you can select and copy/paste the + text. It also converts pdfs into pdf/a type pdfs, which are best + suited for archiving. So it makes sense to use this even for + text-only pdfs. + + It is recommended to install ocrympdf, but it also is optional. + If it is enabled but fails, the error is not fatal and the + processing will continue using the original pdf for extracting + text. You can also disable it to remove the errors from the + processing logs. + + The `--skip-text` option is necessary to not fail on "text" pdfs + (where ocr is not necessary). In this case, the pdf will be + converted to PDF/A. + ''; + }; + + }; + }); + default = defaults.convert; + description = '' + Configuration for converting files into PDFs. + + Most of it is delegated to external tools, which can be configured + below. They must be in the PATH environment or specify the full + path below via the `program` key. + ''; + }; + files = mkOption { + type = types.submodule ({ + options = { + chunk-size = mkOption { + type = types.int; + default = defaults.files.chunk-size; + description = '' + Defines the chunk size (in bytes) used to store the files. + This will affect the memory footprint when uploading and + downloading files. At most this amount is loaded into RAM for + down- and uploading. + + It also defines the chunk size used for the blobs inside the + database. + ''; + }; + valid-mime-types = mkOption { + type = types.listOf types.str; + default = defaults.files.valid-mime-types; + description = '' + The file content types that are considered valid. Docspell + will only pass these files to processing. The processing code + itself has also checks for which files are supported and which + not. This affects the uploading part and is a first check to + avoid that 'bad' files get into the system. + ''; + }; + }; + }); + default = defaults.files; + description = "Settings for how files are stored."; + }; + full-text-search = mkOption { + type = types.submodule ({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.full-text-search.enabled; + description = '' + The full-text search feature can be disabled. It requires an + additional index server which needs additional memory and disk + space. 
It can be enabled later any time. + + Currently the SOLR search platform is supported. + ''; + }; + backend = mkOption { + type = types.str; + default = defaults.full-text-search.backend; + description = "The backend to use, either solr or postgresql"; + }; + + solr = mkOption { + type = types.submodule ({ + options = { + url = mkOption { + type = types.str; + default = defaults.full-text-search.solr.url; + description = "The URL to solr"; + }; + commit-within = mkOption { + type = types.int; + default = defaults.full-text-search.solr.commit-within; + description = "Used to tell solr when to commit the data"; + }; + log-verbose = mkOption { + type = types.bool; + default = defaults.full-text-search.solr.log-verbose; + description = "If true, logs request and response bodies"; + }; + def-type = mkOption { + type = types.str; + default = defaults.full-text-search.solr.def-type; + description = '' + The defType parameter to lucene that defines the parser to + use. You might want to try "edismax" or look here: + https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing + ''; + }; + q-op = mkOption { + type = types.str; + default = defaults.full-text-search.solr.q-op; + description = "The default combiner for tokens. One of {AND, OR}."; + }; + }; + }); + default = defaults.full-text-search.solr; + description = "Configuration for the SOLR backend."; + }; + + postgresql = mkOption { + type = types.submodule ({ + options = { + use-default-connection = mkOption { + type = types.bool; + default = defaults.full-text-search.postgresql.use-default-connection; + description = "Whether to use the primary db connection."; + }; + jdbc = mkOption { + type = types.submodule ({ + options = { + url = mkOption { + type = types.str; + default = defaults.jdbc.url; + description = '' + The URL to the database. + ''; + }; + user = mkOption { + type = types.str; + default = defaults.jdbc.user; + description = "The user name to connect to the database."; + }; + password = mkOption { + type = types.str; + default = defaults.jdbc.password; + description = "The password to connect to the database."; + }; + }; + }); + default = defaults.full-text-search.postgresql.jdbc; + description = "Database connection settings"; + }; + pg-config = mkOption { + type = types.attrs; + default = defaults.full-text-search.postgresql.pg-config; + description = ""; + }; + pg-query-parser = mkOption { + type = types.str; + default = defaults.full-text-search.postgresql.pg-query-parser; + description = ""; + }; + pg-rank-normalization = mkOption { + type = types.listOf types.int; + default = defaults.full-text-search.postgresql.pg-rank-normalization; + description = ""; + }; + }; + }); + default = defaults.full-text-search.postgresql; + description = "PostgreSQL for fulltext search"; + }; + + migration = mkOption { + type = types.submodule ({ + options = { + index-all-chunk = mkOption { + type = types.int; + default = defaults.full-text-search.migration.index-all-chunk; + description = '' + Chunk size to use when indexing data from the database. This + many attachments are loaded into memory and pushed to the + full-text index. 
+ ''; + }; + }; + }); + default = defaults.full-text-search.migration; + description = "Settings for running the index migration tasks"; + }; + }; + }); + default = defaults.full-text-search; + description = "Configuration for full-text search."; + }; + addons = mkOption { + type = types.submodule ({ + options = { + working-dir = mkOption { + type = types.str; + default = defaults.addons.working-dir; + description = "Working directory"; + }; + cache-dir = mkOption { + type = types.str; + default = defaults.addons.cache-dir; + description = "Cache directory"; + }; + executor-config = mkOption { + type = types.submodule ({ + options = { + runner = mkOption { + type = types.str; + default = defaults.addons.executor-config.runner; + description = "The supported runners by this joex"; + }; + fail-fast = mkOption { + type = types.bool; + default = defaults.addons.executor-config.fail-fast; + description = ""; + }; + run-timeout = mkOption { + type = types.str; + default = defaults.addons.executor-config.run-timeout; + description = ""; + }; + nspawn = mkOption { + type = types.submodule ({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.addons.nspawn.enabled; + description = "Enable to use systemd-nspawn"; + }; + sudo-binary = mkOption { + type = types.str; + default = defaults.addons.executor-config.nspawn.sudo-binary; + description = ""; + }; + nspawn-binary = mkOption { + type = types.str; + default = defaults.addons.executor-config.nspawn.nspawn-binary; + description = ""; + }; + container-wait = mkOption { + type = types.str; + default = defaults.addons.executor-config.nspawn.container-wait; + description = ""; + }; + }; + }); + default = defaults.addons.executor-config.nspawn; + description = ""; + }; + nix-runner = mkOption { + type = types.submodule ({ + options = { + nix-binary = mkOption { + type = types.str; + default = defaults.addons.executor-config.nix-runner.nix-binary; + description = ""; + }; + build-timeout = mkOption { + type = types.str; + default = defaults.addons.executor-config.nix-runner.build-timeout; + description = ""; + }; + }; + }); + default = defaults.addons.executor-config.nix-runner; + description = ""; + }; + docker-runner = mkOption { + type = types.submodule ({ + options = { + docker-binary = mkOption { + type = types.str; + default = defaults.addons.executor-config.docker-runner.docker-binary; + description = ""; + }; + build-timeout = mkOption { + type = types.str; + default = defaults.addons.executor-config.docker-runner.build-timeout; + description = ""; + }; + }; + }); + default = defaults.addons.executor-config.docker-runner; + description = ""; + }; + }; + }); + default = defaults.addons.executor-config; + description = ""; + }; + }; + }); + default = defaults.addons; + description = "Addon executor config"; + }; + }; + }; + + ## implementation + config = mkIf config.services.docspell-joex.enable { + + nixpkgs.overlays = [ overlay ]; + + users.users."${user}" = mkIf (cfg.runAs == null) { + name = user; + isSystemUser = true; + createHome = true; + home = "/var/docspell"; + description = "Docspell user"; + group = user; + }; + users.groups."${user}" = mkIf (cfg.runAs == null) { }; + + # Setting up a unoconv listener to improve conversion performance + systemd.services.unoconv = + let + cmd = "${pkgs.unoconv}/bin/unoconv --listener -v"; + in + { + description = "Unoconv Listener"; + after = [ "networking.target" ]; + wantedBy = [ "multi-user.target" ]; + serviceConfig = { + Restart = "always"; + }; + script = + 
"${pkgs.su}/bin/su -s ${pkgs.bash}/bin/sh ${user} -c \"${cmd}\""; + }; + + systemd.services.docspell-joex = + let + args = builtins.concatStringsSep " " cfg.jvmArgs; + cmd = "${pkgs.docspell-joex}/bin/docspell-joex ${args} -- ${configFile}"; + waitTarget = + if cfg.waitForTarget != null + then + [ cfg.waitForTarget ] + else + [ ]; + in + { + description = "Docspell Joex"; + after = ([ "networking.target" ] ++ waitTarget); + wantedBy = [ "multi-user.target" ]; + path = [ pkgs.gawk ]; + + script = + "${pkgs.su}/bin/su -s ${pkgs.bash}/bin/sh ${user} -c \"${cmd}\""; + }; + }; +} diff --git a/nix/flake/modules/server.nix b/nix/flake/modules/server.nix new file mode 100644 index 0000000000..33e3aaa796 --- /dev/null +++ b/nix/flake/modules/server.nix @@ -0,0 +1,877 @@ +overlay: { config, lib, pkgs, ... }: + +with lib; +let + cfg = config.services.docspell-restserver; + user = if cfg.runAs == null then "docspell" else cfg.runAs; + configFile = pkgs.writeText "docspell-server.conf" '' + {"docspell": {"server": + ${builtins.toJSON cfg} + }} + ''; + defaults = { + app-name = "Docspell"; + app-id = "rest1"; + base-url = "http://localhost:7880"; + internal-url = "http://localhost:7880"; + max-item-page-size = 200; + max-note-length = 180; + show-classification-settings = true; + bind = { + address = "localhost"; + port = 7880; + }; + server-options = { + enable-http-2 = false; + max-connections = 1024; + response-timeout = "45s"; + }; + logging = { + minimum-level = "Info"; + format = "Fancy"; + levels = { + "docspell" = "Info"; + "org.flywaydb" = "Info"; + "binny" = "Info"; + "org.http4s" = "Info"; + }; + }; + integration-endpoint = { + enabled = false; + priority = "low"; + source-name = "integration"; + allowed-ips = { + enabled = false; + ips = [ "127.0.0.1" ]; + }; + http-basic = { + enabled = false; + realm = "Docspell Integration"; + user = "docspell-int"; + password = "docspell-int"; + }; + http-header = { + enabled = false; + header-name = "Docspell-Integration"; + header-value = "some-secret"; + }; + }; + admin-endpoint = { + secret = ""; + }; + full-text-search = { + enabled = false; + backend = "solr"; + solr = { + url = "http://localhost:8983/solr/docspell"; + commit-within = 1000; + log-verbose = false; + def-type = "lucene"; + q-op = "OR"; + }; + postgresql = { + use-default-connection = false; + jdbc = { + url = "jdbc:postgresql://server:5432/db"; + user = "pguser"; + password = ""; + }; + pg-config = { }; + pg-query-parser = "websearch_to_tsquery"; + pg-rank-normalization = [ 4 ]; + }; + }; + auth = { + server-secret = "hex:caffee"; + session-valid = "5 minutes"; + remember-me = { + enabled = true; + valid = "30 days"; + }; + }; + download-all = { + max-files = 500; + max-size = "1400M"; + }; + openid = { + enabled = false; + display = ""; + provider = { + provider-id = null; + client-id = null; + client-secret = null; + scope = "profile"; + authorize-url = null; + token-url = null; + user-url = null; + sign-key = ""; + sig-algo = "RS256"; + }; + user-key = "preferred_username"; + collective-key = "lookup:preferred_username"; + }; + backend = { + mail-debug = false; + jdbc = { + url = "jdbc:h2:///tmp/docspell-demo.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"; + user = "sa"; + password = ""; + }; + signup = { + mode = "open"; + new-invite-password = ""; + invite-time = "3 days"; + }; + files = { + chunk-size = 524288; + valid-mime-types = [ ]; + }; + addons = { + enabled = false; + allow-impure = true; + allowed-urls = [ "*" ]; + denied-urls = [ ]; + }; + }; + }; +in +{ + 
+ ## interface + options = { + services.docspell-restserver = { + enable = mkOption { + default = false; + description = "Whether to enable docspell."; + }; + runAs = mkOption { + type = types.nullOr types.str; + default = null; + description = '' + Specify a user for running the application. If null, a new + user is created. + ''; + }; + jvmArgs = mkOption { + type = types.listOf types.str; + default = [ ]; + example = [ "-J-Xmx1G" ]; + description = "The options passed to the executable for setting jvm arguments."; + }; + + + app-name = mkOption { + type = types.str; + default = defaults.app-name; + description = "The name used in the web ui and in notification mails."; + }; + + app-id = mkOption { + type = types.str; + default = defaults.app-id; + description = '' + This is the id of this node. If you run more than one server, you + have to make sure to provide unique ids per node. + ''; + }; + + base-url = mkOption { + type = types.str; + default = defaults.base-url; + description = '' + This is the base URL this application is deployed to. This is used + to create absolute URLs and to configure the cookie. + ''; + }; + + internal-url = mkOption { + type = types.str; + default = defaults.internal-url; + description = '' + This url is the base url for reaching this server internally. + While you might set `base-url` to some external address (like + mydocs.myserver.com), the `internal-url` must be set such that + other nodes can reach this server. + ''; + }; + + max-item-page-size = mkOption { + type = types.int; + default = defaults.max-item-page-size; + description = '' + This is a hard limit to restrict the size of a batch that is + returned when searching for items. The user can set this limit + within the client config, but it is restricted by the server to + the number defined here. An admin might choose a lower number + depending on the available resources. + ''; + }; + + max-note-length = mkOption { + type = types.int; + default = defaults.max-note-length; + description = '' + The number of characters to return for each item notes when + searching. Item notes may be very long, when returning them with + all the results from a search, they add quite some data to return. + In order to keep this low, a limit can be defined here. + ''; + }; + + show-classification-settings = mkOption { + type = types.bool; + default = defaults.show-classification-settings; + description = '' + This defines whether the classification form in the collective + settings is displayed or not. If all joex instances have document + classification disabled, it makes sense to hide its settings from + users. 
+        '';
+      };
+
+      bind = mkOption {
+        type = types.submodule ({
+          options = {
+            address = mkOption {
+              type = types.str;
+              default = defaults.bind.address;
+              description = "The address to bind the REST server to.";
+            };
+            port = mkOption {
+              type = types.int;
+              default = defaults.bind.port;
+              description = "The port to bind the REST server to.";
+            };
+          };
+        });
+        default = defaults.bind;
+        description = "Address and port to bind the REST server to.";
+      };
+
+      server-options = mkOption {
+        type = types.submodule ({
+          options = {
+            enable-http-2 = mkOption {
+              type = types.bool;
+              default = defaults.server-options.enable-http-2;
+              description = "Whether to enable http2.";
+            };
+            max-connections = mkOption {
+              type = types.int;
+              default = defaults.server-options.max-connections;
+              description = "Maximum number of client connections.";
+            };
+            response-timeout = mkOption {
+              type = types.str;
+              default = defaults.server-options.response-timeout;
+              description = "Timeout when waiting for the response.";
+            };
+          };
+        });
+        default = defaults.server-options;
+        description = "Tuning the http server.";
+      };
+
+      logging = mkOption {
+        type = types.submodule ({
+          options = {
+            minimum-level = mkOption {
+              type = types.str;
+              default = defaults.logging.minimum-level;
+              description = "The minimum level for logging to control verbosity.";
+            };
+            format = mkOption {
+              type = types.str;
+              default = defaults.logging.format;
+              description = "The log format. One of: Fancy, Plain, Json or Logfmt.";
+            };
+            levels = mkOption {
+              type = types.attrs;
+              default = defaults.logging.levels;
+              description = "Set of loggers and their levels.";
+            };
+          };
+        });
+        default = defaults.logging;
+        description = "Settings for logging.";
+      };
+
+      auth = mkOption {
+        type = types.submodule ({
+          options = {
+            server-secret = mkOption {
+              type = types.str;
+              default = defaults.auth.server-secret;
+              description = ''
+                The secret for this server that is used to sign the authenticator
+                tokens. If multiple servers are running, all must share the same
+                secret. You can use base64 or hex strings (prefix with b64: and
+                hex:, respectively).
+              '';
+            };
+            session-valid = mkOption {
+              type = types.str;
+              default = defaults.auth.session-valid;
+              description = ''
+                How long an authentication token is valid. The web application
+                will get a new one periodically.
+ ''; + }; + remember-me = mkOption { + type = types.submodule ({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.auth.remember-me.enabled; + description = "Whether to enable remember me."; + }; + valid = mkOption { + type = types.str; + default = defaults.auth.remember-me.valid; + description = "The time a remember-me token is valid."; + }; + }; + }); + default = defaults.auth.remember-me; + description = "Settings for Remember-Me"; + }; + }; + }); + default = defaults.auth; + description = "Authentication"; + }; + + download-all = mkOption { + type = types.submodule ({ + options = { + max-files = mkOption { + type = types.int; + default = defaults.download-all.max-files; + description = "How many files to allow in a zip."; + }; + max-size = mkOption { + type = types.str; + default = defaults.download-all.max-size; + description = "The maximum (uncompressed) size of the zip file contents."; + }; + }; + }); + default = defaults.download-all; + description = ""; + }; + + openid = mkOption { + type = types.listOf (types.submodule { + options = { + enabled = mkOption { + type = types.bool; + default = defaults.openid.enabled; + description = "Whether to use these settings."; + }; + display = mkOption { + type = types.str; + default = defaults.openid.display; + example = "via Keycloak"; + description = "The name for the button on the login page."; + }; + user-key = mkOption { + type = types.str; + default = defaults.openid.user-key; + description = "The key to retrieve the username"; + }; + collective-key = mkOption { + type = types.str; + default = defaults.openid.collective-key; + description = "How to retrieve the collective name."; + }; + provider = mkOption { + type = (types.submodule { + options = { + provider-id = mkOption { + type = types.str; + default = defaults.openid.provider.provider-id; + example = "keycloak"; + description = "The id of the provider, used in the URL and to distinguish other providers."; + }; + client-id = mkOption { + type = types.str; + default = defaults.openid.provider.client-id; + description = "The client-id as registered at the OP."; + }; + client-secret = mkOption { + type = types.str; + default = defaults.openid.provider.client-secret; + description = "The client-secret as registered at the OP."; + }; + scope = mkOption { + type = types.str; + default = defaults.openid.provider.scope; + description = "A scope to define what data to return from OP"; + }; + authorize-url = mkOption { + type = types.str; + default = defaults.openid.provider.authorize-url; + description = "The URL used to authenticate the user"; + }; + token-url = mkOption { + type = types.str; + default = defaults.openid.provider.token-url; + description = "The URL used to retrieve the token."; + }; + user-url = mkOption { + type = types.nullOr types.str; + default = defaults.openid.provider.user-url; + description = "The URL to the user-info endpoint."; + }; + sign-key = mkOption { + type = types.str; + default = defaults.openid.provider.sign-key; + description = "The key for verifying the jwt signature."; + }; + sig-algo = mkOption { + type = types.str; + default = defaults.openid.provider.sig-algo; + description = "The expected algorithm used to sign the token."; + }; + }; + }); + default = defaults.openid.provider; + description = "The config for an OpenID Connect provider."; + }; + }; + }); + default = [ ]; + description = "A list of OIDC provider configurations."; + }; + + integration-endpoint = mkOption { + type = types.submodule ({ + options = { + 
            enabled = mkOption {
+              type = types.bool;
+              default = defaults.integration-endpoint.enabled;
+              description = "Whether the endpoint is globally enabled or disabled.";
+            };
+            priority = mkOption {
+              type = types.str;
+              default = defaults.integration-endpoint.priority;
+              description = "The priority to use when submitting files through this endpoint.";
+            };
+            source-name = mkOption {
+              type = types.str;
+              default = defaults.integration-endpoint.source-name;
+              description = ''
+                The name used for the item "source" property when uploaded through this endpoint.
+              '';
+            };
+            allowed-ips = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.integration-endpoint.allowed-ips.enabled;
+                    description = "Enable/Disable this protection";
+                  };
+                  ips = mkOption {
+                    type = types.listOf types.str;
+                    default = defaults.integration-endpoint.allowed-ips.ips;
+                    description = "The ips/ip patterns to allow";
+                  };
+                };
+              });
+              default = defaults.integration-endpoint.allowed-ips;
+              description = ''
+                IPv4 addresses to allow access. An empty list, if enabled,
+                prohibits all requests. IP addresses may be specified as simple
+                globs: a part marked as `*` matches any octet, like in
+                `192.168.*.*`. The `127.0.0.1` (the default) matches the
+                loopback address.
+              '';
+            };
+            http-basic = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.integration-endpoint.http-basic.enabled;
+                    description = "Enable/Disable this protection";
+                  };
+                  realm = mkOption {
+                    type = types.str;
+                    default = defaults.integration-endpoint.http-basic.realm;
+                    description = "The realm name to provide to the client.";
+                  };
+                  user = mkOption {
+                    type = types.str;
+                    default = defaults.integration-endpoint.http-basic.user;
+                    description = "The user name to check.";
+                  };
+                  password = mkOption {
+                    type = types.str;
+                    default = defaults.integration-endpoint.http-basic.password;
+                    description = "The password to check.";
+                  };
+                };
+              });
+              default = defaults.integration-endpoint.http-basic;
+              description = ''
+                Requests are expected to use http basic auth when uploading files.
+              '';
+            };
+            http-header = mkOption {
+              type = types.submodule ({
+                options = {
+                  enabled = mkOption {
+                    type = types.bool;
+                    default = defaults.integration-endpoint.http-header.enabled;
+                    description = "Enable/Disable this protection";
+                  };
+                  header-name = mkOption {
+                    type = types.str;
+                    default = defaults.integration-endpoint.http-header.header-name;
+                    description = "The header to extract from the request.";
+                  };
+                  header-value = mkOption {
+                    type = types.str;
+                    default = defaults.integration-endpoint.http-header.header-value;
+                    description = "The value of the header to check.";
+                  };
+                };
+              });
+              default = defaults.integration-endpoint.http-header;
+              description = ''
+                Requests are expected to supply some specific header when
+                uploading files.
+              '';
+            };
+          };
+        });
+        default = defaults.integration-endpoint;
+        description = ''
+          This endpoint allows uploading files to any collective. The
+          intention is that local software can integrate with docspell more
+          easily. Therefore the endpoint is not protected by the usual
+          means.
+
+          For security reasons, this endpoint is disabled by default. If
+          enabled, you can choose from some ways to protect it. It may be a
+          good idea to further protect this endpoint using a firewall, such
+          that outside traffic is not routed.
+
+          NOTE: If all protection methods are disabled, the endpoint is not
+          protected at all!
+        '';
+      };
+
+      admin-endpoint = mkOption {
+        type = types.submodule ({
+          options = {
+            secret = mkOption {
+              type = types.str;
+              default = defaults.admin-endpoint.secret;
+              description = "The secret used to call admin endpoints.";
+            };
+          };
+        });
+        default = defaults.admin-endpoint;
+        description = "An endpoint for administration tasks.";
+      };
+
+      full-text-search = mkOption {
+        type = types.submodule ({
+          options = {
+            enabled = mkOption {
+              type = types.bool;
+              default = defaults.full-text-search.enabled;
+              description = ''
+                The full-text search feature can be disabled. It requires an
+                additional index server which needs additional memory and disk
+                space. It can be enabled any time later.
+              '';
+            };
+            backend = mkOption {
+              type = types.str;
+              default = defaults.full-text-search.backend;
+              description = "The backend to use, either solr or postgresql.";
+            };
+            solr = mkOption {
+              type = types.submodule ({
+                options = {
+                  url = mkOption {
+                    type = types.str;
+                    default = defaults.full-text-search.solr.url;
+                    description = "The URL to solr.";
+                  };
+                  commit-within = mkOption {
+                    type = types.int;
+                    default = defaults.full-text-search.solr.commit-within;
+                    description = "Used to tell solr when to commit the data.";
+                  };
+                  log-verbose = mkOption {
+                    type = types.bool;
+                    default = defaults.full-text-search.solr.log-verbose;
+                    description = "If true, logs request and response bodies.";
+                  };
+                  def-type = mkOption {
+                    type = types.str;
+                    default = defaults.full-text-search.solr.def-type;
+                    description = ''
+                      The defType parameter to lucene that defines the parser to
+                      use. You might want to try "edismax" or look here:
+                      https://solr.apache.org/guide/8_4/query-syntax-and-parsing.html#query-syntax-and-parsing
+                    '';
+                  };
+                  q-op = mkOption {
+                    type = types.str;
+                    default = defaults.full-text-search.solr.q-op;
+                    description = "The default combiner for tokens. One of {AND, OR}.";
+                  };
+                };
+              });
+              default = defaults.full-text-search.solr;
+              description = "Configuration for the SOLR backend.";
+            };
+
+            postgresql = mkOption {
+              type = types.submodule ({
+                options = {
+                  use-default-connection = mkOption {
+                    type = types.bool;
+                    default = defaults.full-text-search.postgresql.use-default-connection;
+                    description = "Whether to use the primary db connection.";
+                  };
+                  jdbc = mkOption {
+                    type = types.submodule ({
+                      options = {
+                        url = mkOption {
+                          type = types.str;
+                          default = defaults.full-text-search.postgresql.jdbc.url;
+                          description = ''
+                            The URL to the database.
+                          '';
+                        };
+                        user = mkOption {
+                          type = types.str;
+                          default = defaults.full-text-search.postgresql.jdbc.user;
+                          description = "The user name to connect to the database.";
+                        };
+                        password = mkOption {
+                          type = types.str;
+                          default = defaults.full-text-search.postgresql.jdbc.password;
+                          description = "The password to connect to the database.";
+                        };
+                      };
+                    });
+                    default = defaults.full-text-search.postgresql.jdbc;
+                    description = "Database connection settings.";
+                  };
+                  pg-config = mkOption {
+                    type = types.attrs;
+                    default = defaults.full-text-search.postgresql.pg-config;
+                    description = "Optional mapping of a language to a PostgreSQL text search configuration.";
+                  };
+                  pg-query-parser = mkOption {
+                    type = types.str;
+                    default = defaults.full-text-search.postgresql.pg-query-parser;
+                    description = "The PostgreSQL function used to parse the query, e.g. websearch_to_tsquery.";
+                  };
+                  pg-rank-normalization = mkOption {
+                    type = types.listOf types.int;
+                    default = defaults.full-text-search.postgresql.pg-rank-normalization;
+                    description = "The rank normalization to apply (see PostgreSQL's ts_rank documentation).";
+                  };
+                };
+              });
+              default = defaults.full-text-search.postgresql;
+              description = "PostgreSQL backend for full-text search.";
+            };
+          };
+        });
+        default = defaults.full-text-search;
+        description = "Configuration for full-text search.";
+      };
+
+      backend = mkOption {
+        type = types.submodule ({
+          options = {
+            mail-debug = mkOption {
+              type = types.bool;
+              default = defaults.backend.mail-debug;
+              description = ''
+                Enable or disable debugging for e-mail related functionality. This
+                applies to both sending and receiving mails. For security reasons
+                logging is not very extensive on authentication failures. Setting
+                this to true results in a lot of data printed to stdout.
+              '';
+            };
+            jdbc = mkOption {
+              type = types.submodule ({
+                options = {
+                  url = mkOption {
+                    type = types.str;
+                    default = defaults.backend.jdbc.url;
+                    description = ''
+                      The URL to the database. By default a file-based database is
+                      used. It should also work with mariadb and postgresql.
+
+                      Examples:
+                        "jdbc:mariadb://192.168.1.172:3306/docspell"
+                        "jdbc:postgresql://localhost:5432/docspell"
+                        "jdbc:h2:///home/dbs/docspell.db;MODE=PostgreSQL;DATABASE_TO_LOWER=TRUE;AUTO_SERVER=TRUE"
+
+                    '';
+                  };
+                  user = mkOption {
+                    type = types.str;
+                    default = defaults.backend.jdbc.user;
+                    description = "The user name to connect to the database.";
+                  };
+                  password = mkOption {
+                    type = types.str;
+                    default = defaults.backend.jdbc.password;
+                    description = "The password to connect to the database.";
+                  };
+                };
+              });
+              default = defaults.backend.jdbc;
+              description = "Database connection settings.";
+            };
+            signup = mkOption {
+              type = types.submodule ({
+                options = {
+                  mode = mkOption {
+                    type = types.str;
+                    default = defaults.backend.signup.mode;
+                    description = ''
+                      The mode defines whether new users can sign up or not. It can have
+                      three values:
+
+                      - open: every new user can sign up
+                      - invite: new users can sign up only if they provide a correct
+                        invitation key. Invitation keys can be generated by the
+                        server.
+                      - closed: signing up is disabled.
+                    '';
+                  };
+                  new-invite-password = mkOption {
+                    type = types.str;
+                    default = defaults.backend.signup.new-invite-password;
+                    description = ''
+                      If mode == 'invite', a password must be provided to generate
+                      invitation keys. It must not be empty.
+                    '';
+                  };
+                  invite-time = mkOption {
+                    type = types.str;
+                    default = defaults.backend.signup.invite-time;
+                    description = ''
+                      If mode == 'invite', this is the period an invitation token is
+                      considered valid.
+ ''; + }; + }; + }); + default = defaults.backend.signup; + description = "Registration settings"; + }; + files = mkOption { + type = types.submodule ({ + options = { + chunk-size = mkOption { + type = types.int; + default = defaults.backend.files.chunk-size; + description = '' + Defines the chunk size (in bytes) used to store the files. + This will affect the memory footprint when uploading and + downloading files. At most this amount is loaded into RAM for + down- and uploading. + + It also defines the chunk size used for the blobs inside the + database. + ''; + }; + valid-mime-types = mkOption { + type = types.listOf types.str; + default = defaults.backend.files.valid-mime-types; + description = '' + The file content types that are considered valid. Docspell + will only pass these files to processing. The processing code + itself has also checks for which files are supported and which + not. This affects the uploading part and is a first check to + avoid that 'bad' files get into the system. + ''; + }; + }; + }); + default = defaults.backend.files; + description = "Settings for how files are stored."; + }; + addons = mkOption { + type = types.submodule ({ + options = { + enabled = mkOption { + type = types.bool; + default = defaults.backend.addons.enabled; + description = "Enable this feature"; + }; + allow-impure = mkOption { + type = types.bool; + default = defaults.backend.addons.allow-impure; + description = "Allow impure addons"; + }; + allowed-urls = mkOption { + type = types.listOf types.str; + default = defaults.backend.addons.allowed-urls; + description = "Url patterns of addons to be allowed"; + }; + denied-urls = mkOption { + type = types.listOf types.str; + default = defaults.backend.addons.denied-urls; + description = "Url patterns to deny to install"; + }; + }; + }); + default = defaults.backend.addons; + description = "Addon config"; + }; + }; + }); + default = defaults.backend; + description = "Configuration for the backend"; + }; + }; + }; + + ## implementation + config = mkIf config.services.docspell-restserver.enable { + + nixpkgs.overlays = [ overlay ]; + users.users."${user}" = mkIf (cfg.runAs == null) { + name = user; + isSystemUser = true; + createHome = true; + home = "/var/docspell"; + description = "Docspell user"; + group = user; + }; + users.groups."${user}" = mkIf (cfg.runAs == null) { }; + + + systemd.services.docspell-restserver = + let + args = builtins.concatStringsSep " " cfg.jvmArgs; + cmd = "${pkgs.docspell-server}/bin/docspell-restserver ${args} -- ${configFile}"; + in + { + description = "Docspell Rest Server"; + after = [ "networking.target" ]; + wantedBy = [ "multi-user.target" ]; + path = [ pkgs.gawk ]; + preStart = '' + ''; + + script = + "${pkgs.su}/bin/su -s ${pkgs.bash}/bin/sh ${user} -c \"${cmd}\""; + }; + }; +}
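For reference, the sketch below shows how a downstream flake might consume the NixOS module defined above. It is illustrative only and not part of this change: the flake output attribute `nixosModules.default`, the input URL with its `?dir=` parameter, the hostname and the secret values are assumptions/placeholders; only the `services.docspell-restserver` options themselves come from the module above.

  {
    # Hypothetical consumer flake -- names, URLs and secrets are placeholders.
    inputs.nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";
    inputs.docspell.url = "github:eikek/docspell?dir=nix/flake";

    outputs = { self, nixpkgs, docspell }: {
      nixosConfigurations.docs = nixpkgs.lib.nixosSystem {
        system = "x86_64-linux";
        modules = [
          # Assumed export name; check the flake's outputs for the actual attribute.
          docspell.nixosModules.default
          {
            services.docspell-restserver = {
              enable = true;
              base-url = "https://docs.example.com";
              bind.port = 7880;
              admin-endpoint.secret = "change-me";
              backend.signup.mode = "invite";
              backend.signup.new-invite-password = "also-change-me";
            };
          }
        ];
      };
    };
  }

Note that the module applies the flake overlay itself (`nixpkgs.overlays = [ overlay ]` in the implementation section), so the docspell packages do not need to be wired in separately; the option values are serialized with `builtins.toJSON cfg` into the generated docspell-server.conf that is passed to the executable.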