Skip to content

Commit

Permalink
+hhash
Browse files Browse the repository at this point in the history
  • Loading branch information
mk-fg committed Jan 18, 2020
1 parent 76ce457 commit a207759
Show file tree
Hide file tree
Showing 4 changed files with 274 additions and 12 deletions.
34 changes: 34 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1305,6 +1305,40 @@ Other options allow for picking number of words and sanity-checks like min/max l

.. _Diceware-like: https://en.wikipedia.org/wiki/Diceware

hhash
'''''

Produces lower-entropy "human hash" phrase consisting of aspell english
dictionary words for input arg(s) or data on stdin.

It works by first calculating BLAKE2 hash of input string/data via libsodium_,
and then encoding it using consistent word-alphabet, exactly like something like
base32 or base64 does.

Example::

% hhash -e AAAAC3NzaC1lZDI1NTE5AAAAIPh5/VmxDwgtJI0HiFBqZkbyV1I1YK+2DVjGjYydNp5o
allan avenues regrade windups flours
entropy-stats: word-count=5 dict-words=126643 word-bits=17.0 total-bits=84.8

Here -e is used to print entropy estimate for produced words.

Note that resulting entropy values can be fractional if word-alphabet ends up
being padded to map exactly to N bits (e.g. 17 bits above), so that words in it
can be repeated, hence not exactly 17 bits of distinct values.

Written in OCAML, linked against libsodium_ (for BLAKE2 hash function) via small
C glue code, build with::

% ocamlc -c hhash_glue.c
% ocamlopt -o hhash -O2 unix.cmxa str.cmxa hhash_glue.o -cclib -lsodium hhash.ml
% strip hhash

Caches dictionary into a ~/.cache/hhash.dict (-c option) on first run to produce
consistent results on this machine. Updating that dictionary will change outputs!

.. _libsodium: https://libsodium.org/

urlparse
''''''''

Expand Down
24 changes: 12 additions & 12 deletions desktop/uri_handlers/magnet_relay_transmission.ml
Original file line number Diff line number Diff line change
Expand Up @@ -32,14 +32,14 @@ let () =
("-s", Arg.Set_string cli_path_suffix,
"-- Suffix for files to be processed." ^
" Can be empty to process all.\n" ^
" Case-sensitive. Default: " ^ !cli_path_suffix );
" Case-sensitive. Default: " ^ !cli_path_suffix);
("-d", Arg.Set cli_debug, " " ^ debug_desc);
("--debug", Arg.Set cli_debug, debug_desc) ]
( fun arg ->
(fun arg ->
if !cli_path_specified
then raise (Arg.Bad ("Bogus extra arg : " ^ arg))
else ( cli_path := arg; cli_path_specified := true ) )
("Usage: " ^ Sys.argv.(0) ^ " [-d|--debug] [path]\
else (cli_path := arg; cli_path_specified := true))
("Usage: " ^ Sys.argv.(0) ^ " [-d|--debug] [opts] [path]\
\n\nWatch path for new .magnet files and run transmission-remote with link from each one.\
\nUses current directory if path is not specified.\n")

Expand All @@ -57,7 +57,7 @@ external in_ev_name : bytes -> int -> int -> bytes = "mlin_ev_name"
let in_read_paths =
(* Note: buffer doesn't persist between reads, assuming that events are read all-or-nothing *)
let hdr_len = in_hdr_len () in
( fun fd ->
(fun fd ->
let buff_len = in_peek fd in
let buff = Bytes.make buff_len '\000' in
let buff_len = Unix.read fd buff 0 buff_len in
Expand All @@ -66,7 +66,7 @@ let in_read_paths =
if m >= buff_len then path_list else
let ev_name = in_ev_name buff buff_len n in
parse_ev (m + Bytes.length ev_name) (ev_name :: path_list) in
List.rev (parse_ev 0 []) )
List.rev (parse_ev 0 []))


let watch_path () =
Expand Down Expand Up @@ -116,21 +116,21 @@ let watch_path () =
cmd_pids := (if !cli_debug
then cmd_func Unix.stdin Unix.stdout Unix.stderr
else cmd_func cmd_pipe cmd_pipe cmd_pipe) :: !cmd_pids;
debug_print ( Printf.sprintf
debug_print (Printf.sprintf
"--- - new-pid=%d [path-q=%d cmd-q=%d]: %s"
(List.hd !cmd_pids) (List.length !path_queue)
(List.length !cmd_pids) path ); in
(List.length !cmd_pids) path); in
(* XXX: cleanup link-files if command exits with 0 *)
(* XXX: log non-clean pid exits to stderr for systemd *)

let rec cmd_check () =
cmd_check_needed := false;
cmd_pids := List.filter
( fun pid ->
(fun pid ->
let pid, status =
try Unix.waitpid [Unix.WNOHANG] pid
with Unix.Unix_error (Unix.ECHILD, _, _) -> (1, Unix.WEXITED 0)
in pid == 0 )
in pid == 0)
!cmd_pids;
cmd_spawn ();
if !cmd_check_needed then cmd_check () else
Expand All @@ -153,7 +153,7 @@ let watch_path () =
if List.length x == 0 && List.length r >= 1 then
let path_list = in_read_paths fd in
List.iter
( fun path ->
(fun path ->
let path = Bytes.to_string path in
let path_match =
try ignore (Str.search_forward re_path path 0); true
Expand All @@ -162,7 +162,7 @@ let watch_path () =
debug_print (Printf.sprintf "--- file: %s" path);
path_queue := !path_queue @ [path]
) else debug_print (Printf.sprintf "--- file [skip]: %s" path);
cmd_check () )
cmd_check ())
path_list;
ev_process () in
ev_process () in
Expand Down
168 changes: 168 additions & 0 deletions hhash.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
(* Command-line tool to hash strings to words from cached aspell dictionary.
*
* Build with:
* % ocamlc -c hhash_glue.c
* % ocamlopt -o hhash -O2 unix.cmxa str.cmxa hhash_glue.o -cclib -lsodium hhash.ml
* % strip hhash
*
* Usage:
* % ./hhash some-fingerprint other-fp-string
* % ./hhash -e <<< file-contents
*)

let cli_dict_cmd = ref "/usr/bin/aspell -d en dump master"
let cli_strip_re = ref "'s$"
let cli_cache_file = ref "~/.cache/hhash.dict"
let cli_entropy_est = ref false
let cli_strings = ref []
let cli_word_count = ref 5
let cli_dict_words_max = ref (int_of_float ((2. ** 30.) /. 10.)) (* ~1 GiB of ~10-char words *)

exception HHash_fail of string


(* Command-line args processing *)
let () =
Arg.parse
[ ("-w", Arg.Set_int cli_word_count,
"-- Output word count. Default: " ^ (string_of_int !cli_word_count));
("-d", Arg.Set_string cli_dict_cmd,
"-- Dict-dump-command to run to get list of hash-component words on stdout.\n" ^
" Command will be split on spaces. Default: " ^ !cli_dict_cmd ^ "\n" ^
" Output words are split on spaces/newlines, will be sorted, with duplicates removed.\n");
("-s", Arg.Set_string cli_strip_re,
"-- Strip specified regexp-match(es) from each input word. Default: " ^ !cli_strip_re);
("-c", Arg.Set_string cli_cache_file,
"-- File to store aspell cache into. Should be persistent for consistent outputs.\n" ^
" Can be empty to re-run dict-dump cmd every time. Default: " ^ !cli_cache_file);
("-e", Arg.Set cli_entropy_est,
"-- Print entropy estimate for resulting hash value.") ]
(fun arg -> cli_strings := arg :: !cli_strings)
("Usage: " ^ Sys.argv.(0) ^ " [opts] [string ...]\
\n\nOutput word-hashes for each specified string(s) (same order, one per line), or use stdin if none are specified.\
\nSame idea as in e.g. base32 or base64, but with \"alphabet\" being word-dictionary.\"
\nThis is NOT cryptographic hash (wrt entropy, dsitribution, etc), and should not be used as such.\n")


(* Build word_arr alphabet *)
let word_count, word_bits, word_arr =
let cache_file = if Str.string_match (Str.regexp "^~/\\(.*\\)$") !cli_cache_file 0
then Sys.getenv "HOME" ^ "/" ^ (Str.matched_group 1 !cli_cache_file) else !cli_cache_file in

(* Lookup binary in PATH for Unix.open_process_args_in, if necessary *)
let dict_cmd = (String.split_on_char ' ' (String.trim !cli_dict_cmd)) in
let dict_cmd_bin = List.hd dict_cmd in
let dict_cmd = if dict_cmd_bin.[0] = '/' then dict_cmd else
let rec find_in_path bin path =
if (List.length path) = 0
then raise (Sys_error (Printf.sprintf "Failed to find binary in PATH: %s" dict_cmd_bin))
else let bin_abs = Filename.concat (List.hd path) bin in
if bin_abs.[0] = '/' && Sys.file_exists bin_abs
then bin_abs else find_in_path bin (List.tl path) in
(find_in_path dict_cmd_bin (String.split_on_char ':' (Sys.getenv "PATH"))) :: (List.tl dict_cmd) in

(* Open cache-file or command output *)
(* src_is_proc is used for closeing and checked for whether to create cache-file later *)
let src, src_is_proc =
try (open_in cache_file, false)
with Sys_error err ->
if Str.string_match (Str.regexp "^.*: No such file or directory$") err 0
then (Unix.open_process_args_in (List.hd dict_cmd) (Array.of_list dict_cmd), true)
else raise (Sys_error err) in

(* Read words from command output or cache-file, sort/dedup *)
let words =
List.sort_uniq (fun a b -> if a == b then 0 else if a < b then -1 else 1)
(let strip_re = Str.regexp !cli_strip_re in
let rec read_lines list n = if n <= 0
then raise (HHash_fail (Printf.sprintf "Too many input words (max=%d)" !cli_dict_words_max))
else
let list =
let words = String.split_on_char ' ' (input_line src) in
(List.map (fun w -> Str.global_replace
strip_re "" (String.lowercase_ascii w)) words) @ list in
try read_lines list (n - 1)
with End_of_file -> if not src_is_proc then (close_in src; list) else
match Unix.close_process_in src with
Unix.WEXITED code -> if code = 0 then list else
raise (HHash_fail (Printf.sprintf "Input subprocess failed (exit-code=%d)" code))
| Unix.WSIGNALED code | Unix.WSTOPPED code ->
raise (HHash_fail (Printf.sprintf "Input subprocess killed (signal=%d)" code)) in
read_lines [] !cli_dict_words_max) in

(* Write resulting words to cache-file *)
let () = if not src_is_proc then () else
let cache = open_out cache_file in
List.iter (fun w -> output_string cache (w ^ "\n")) words;
close_out cache in

(* Pad words from the beginning to binary-power size, if it's almost there, truncate otherwise *)
(* This potentially makes words non-unique *)
let n = List.length words in
let n_bits = (log (float_of_int n)) /. (log 2.) in
let words =
let n_floor = 2. ** (floor n_bits) in
let n_rem = (float_of_int n) -. n_floor in
let rec sublist a b list =
match list with [] -> failwith "sublist" | e :: tail ->
let tail = if b <= 1 then [] else sublist (a-1) (b-1) tail in
if a > 0 then tail else e :: tail in
if (n_rem /. n_floor) < 0.7
then sublist 0 (int_of_float n_floor) words
else words @ (sublist 0 (int_of_float (n_floor -. n_rem)) words) in

let n_bits = if (List.length words) < n then (floor n_bits) else n_bits in
(n, n_bits, (Array.of_list words))


let read_byte_iter_func s =
let n = ref 0 in let n_max = Bytes.length s in
(fun () -> if !n < n_max
then (n := !n + 1; Bytes.get s (!n-1)) else raise End_of_file)

let hash_to_words read_byte =
let n_bits = int_of_float (* n_bits here will be int with padded array *)
((log (float_of_int (Array.length word_arr))) /. (log 2.)) in
let rec read_input hash n bits =
let b =
try int_of_char (read_byte ())
with End_of_file -> -1 in
if b < 0 then
if bits = 0 || bits = n_bits then hash else (n :: hash)
else
let shift = min 8 bits in (* n of bits from b to current n *)
let rem = 8 - shift in
(* Printf.printf "input[%d] = %x [n=%x bits=%d shift=%d rem=%d]\n"
* (List.length hash) b n bits shift rem; *)
let n = (Int.shift_left n shift) lor (Int.shift_right b rem) in (* extend n by "shift" bits *)
let b = b land (int_of_float (2. ** (float_of_int rem)) - 1) in (* remaining unused bits *)
let bits = bits - shift in
if bits = 0
then read_input (n :: hash) b (n_bits - rem)
else read_input hash n bits in
let hash = List.tl (read_input [] 0 n_bits) in (* always drop final word *)
List.map (fun n -> Array.get word_arr n) hash

let hash_str read_byte = String.concat " " (hash_to_words read_byte)
let hash_print read_byte = Printf.printf "%s\n%!" (hash_str read_byte)


external hash_raw : string -> int -> bytes = "mls_hash_string"
external hash_raw_stdin : int -> bytes = "mls_hash_stdin"

let () =
let n = float_of_int (Array.length word_arr) in
let n_bits = int_of_float ((log n) /. (log 2.)) in
let hash_bits = !cli_word_count * n_bits in
(* hash_len gets +1 because last incomplete word is always dropped *)
let hash_len = (int_of_float (floor ((float_of_int hash_bits) /. 8.))) + 1 in
if (List.length !cli_strings) > 0
then List.iter (fun s ->
let hash = hash_raw s hash_len in
hash_print (read_byte_iter_func hash)) !cli_strings
else hash_print (read_byte_iter_func (hash_raw_stdin hash_len))

let () =
if not !cli_entropy_est then () else Printf.printf
"entropy-stats: word-count=%d dict-words=%d word-bits=%.1f total-bits=%.1f\n"
!cli_word_count word_count word_bits ((float_of_int !cli_word_count) *. word_bits)
60 changes: 60 additions & 0 deletions hhash_glue.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include <string.h>
#include <stdio.h>
#include <unistd.h>

#include <caml/mlvalues.h>
#include <caml/memory.h>
#include <caml/alloc.h>
#include <caml/fail.h>

// Note: int return values of crypto_generichash funcs seem to be undocumented
#include <sodium.h>


char *key = "hhash.1";
int key_len = 7;

int block_size = 2^16;


value mls_hash_string(value v_str, value v_hash_len) {
CAMLparam2(v_str, v_hash_len);
if (sodium_init() < 0) caml_failwith("sodium_init failed");

int hash_len = Int_val(v_hash_len);
if (!hash_len) hash_len = crypto_generichash_BYTES;
value v_bs = caml_alloc_string(hash_len);
char *hash = Bytes_val(v_bs);

(void) crypto_generichash( hash, hash_len,
Bytes_val(v_str), caml_string_length(v_str), key, key_len );

// for (int n=0; n < hash_len; n++) hash[n] = '\xff';
CAMLreturn(v_bs);
}


value mls_hash_stdin(value v_hash_len) {
CAMLparam1(v_hash_len);
if (sodium_init() < 0) caml_failwith("sodium_init failed");

int hash_len = Int_val(v_hash_len);
if (!hash_len) hash_len = crypto_generichash_BYTES;
value v_bs = caml_alloc_string(hash_len);
char *hash = Bytes_val(v_bs);

crypto_generichash_state state;
(void) crypto_generichash_init(&state, key, key_len, hash_len);

int res;
char *block = Bytes_val(caml_alloc_string(block_size));
while (1) {
res = (int) read(0, block, block_size);
block[res] = '\0';
if (!res) break;
(void) crypto_generichash_update(&state, block, res); }
(void) crypto_generichash_final(&state, hash, hash_len);

// for (int n=0; n < hash_len; n++) hash[n] = '\xff';
CAMLreturn(v_bs);
}

0 comments on commit a207759

Please sign in to comment.