-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutil_search.ml
105 lines (93 loc) · 2.37 KB
/
util_search.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
(*
Naive text search.
*)
open Printf
type query = string
(* Words that must exist in the document, in any order. *)
(* Regexps that must match the document.
whole_query or words must match. *)
type compiled_query = {
whole_query: Pcre.regexp option;
words: Pcre.regexp list option;
}
(*
A case-insensitive match on an arbitrary string not followed or preceded
by ascii word characters.
*)
let make_word_regexp w =
let pattern =
sprintf
"(?<![A-Za-z_])%s(?![A-Za-z_])"
(Pcre.quote w)
in
Pcre.regexp ~flags:[`CASELESS] pattern
let ascii_word_pattern = "[A-Za-z_]+"
let ascii_word_regexp = Pcre.regexp ascii_word_pattern
let extract_query_words s =
try
let aa = Pcre.extract_all ~rex:ascii_word_regexp s in
let a =
Array.map (function
| [| sub |] -> sub
| _ -> assert false
) aa
in
Array.to_list a
with Not_found ->
[]
(*
Compile a query given as a string by the end-user.
*)
let compile (query : query) : compiled_query =
let query = Util_string.compact_whitespace query in
let ascii_words = extract_query_words query in
let norm_query = String.lowercase_ascii query in
let norm_words =
Util_list.unique (BatList.map String.lowercase_ascii ascii_words)
in
let whole_query =
if not (BatList.mem norm_query norm_words) then
Some (make_word_regexp norm_query)
else
None
in
let word_regexps =
match norm_words with
| [] -> None
| l -> Some (BatList.map make_word_regexp l)
in
{ whole_query; words = word_regexps }
(*
Return whether a document matches the query.
*)
let matches compiled_query doc =
(match compiled_query.whole_query with
| None -> false
| Some rex -> Pcre.pmatch ~rex doc
)
||
(match compiled_query.words with
| None -> false
| Some l ->
BatList.for_all (fun rex -> Pcre.pmatch ~rex doc) l
)
let test_search () =
let yes query doc = matches (compile query) doc in
let no query doc = not (yes query doc) in
assert (no "" "abc");
assert (yes "" "");
assert (no "-" "a-b");
assert (yes "-" "a - b");
assert (yes "-" "-");
assert (no "a-b" "--b");
assert (yes "ab" "c-ab-c");
assert (no "ab" "abc");
assert (yes "ab cd" "cd-ab");
assert (yes "ab-cd" "cd ab");
assert (yes "dôme" "dôme");
assert (yes "dôme" "meôd");
assert (no "dôme" "domestic");
true
let tests = [
"search", test_search;
]