From 7636ba84e19ffa649bc88bd73a86c3068ba56b3c Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Tue, 2 May 2023 08:31:00 -0400 Subject: [PATCH] *: lots of polish This commit grew into a monster. I ran out of energy trying to split everything up. For the most part, this commit is about polishing and writing docs. --- .github/workflows/ci.yml | 2 + Cargo.toml | 36 +- PERFORMANCE.md | 277 -- README.md | 132 +- UNICODE.md | 13 +- examples/regexdna-input.txt | 1671 ----------- examples/regexdna-output.txt | 13 - examples/shootout-regex-dna-bytes.rs | 68 - examples/shootout-regex-dna-cheat.rs | 90 - examples/shootout-regex-dna-single-cheat.rs | 75 - examples/shootout-regex-dna-single.rs | 57 - examples/shootout-regex-dna.rs | 68 - fuzz/fuzz_targets/ast_fuzz_match.rs | 9 +- fuzz/fuzz_targets/ast_fuzz_match_bytes.rs | 8 +- ...zz_regex_automata_deserialize_dense_dfa.rs | 6 +- ...z_regex_automata_deserialize_sparse_dfa.rs | 6 +- fuzz/fuzz_targets/fuzz_regex_lite_match.rs | 7 +- fuzz/fuzz_targets/fuzz_regex_match.rs | 8 +- regex-automata/Cargo.toml | 5 +- regex-automata/README.md | 172 +- regex-automata/src/dfa/automaton.rs | 39 +- regex-automata/src/dfa/dense.rs | 23 +- regex-automata/src/dfa/mod.rs | 4 +- regex-automata/src/dfa/onepass.rs | 99 +- regex-automata/src/dfa/search.rs | 54 +- regex-automata/src/dfa/sparse.rs | 10 +- regex-automata/src/hybrid/dfa.rs | 86 +- regex-automata/src/hybrid/id.rs | 5 +- regex-automata/src/hybrid/search.rs | 46 +- regex-automata/src/lib.rs | 549 +++- regex-automata/src/meta/limited.rs | 10 +- regex-automata/src/meta/regex.rs | 74 +- regex-automata/src/meta/stopat.rs | 10 +- regex-automata/src/meta/strategy.rs | 110 +- regex-automata/src/nfa/thompson/backtrack.rs | 2 +- regex-automata/src/nfa/thompson/nfa.rs | 19 +- regex-automata/src/nfa/thompson/range_trie.rs | 54 +- regex-automata/src/util/captures.rs | 27 +- regex-automata/src/util/escape.rs | 25 +- regex-automata/src/util/interpolate.rs | 8 +- regex-automata/src/util/iter.rs | 40 + 
regex-automata/src/util/lazy.rs | 6 +- regex-automata/src/util/mod.rs | 3 +- regex-automata/src/util/pool.rs | 6 +- regex-automata/src/util/prefilter/mod.rs | 5 + regex-automata/src/util/search.rs | 2 +- regex-automata/src/util/syntax.rs | 34 + regex-automata/tests/dfa/onepass/suite.rs | 1 + regex-automata/tests/dfa/suite.rs | 1 + regex-automata/tests/hybrid/suite.rs | 1 + regex-automata/tests/meta/suite.rs | 1 + .../tests/nfa/thompson/backtrack/suite.rs | 1 + .../tests/nfa/thompson/pikevm/suite.rs | 1 + regex-capi/src/rure.rs | 12 +- regex-cli/README.md | 265 ++ regex-cli/args/meta.rs | 52 +- regex-cli/cmd/debug/dfa.rs | 20 +- regex-cli/cmd/debug/literal.rs | 4 +- regex-cli/cmd/debug/mod.rs | 21 +- regex-cli/cmd/find/capture/mod.rs | 20 +- regex-cli/cmd/find/half/mod.rs | 20 +- regex-cli/cmd/find/match/mod.rs | 20 +- regex-cli/cmd/find/which/mod.rs | 20 +- regex-cli/cmd/generate/serialize/dfa.rs | 7 + regex-lite/Cargo.toml | 10 +- regex-lite/README.md | 130 +- regex-lite/src/error.rs | 9 + regex-lite/src/hir/mod.rs | 122 +- regex-lite/src/hir/parse.rs | 8 +- regex-lite/src/interpolate.rs | 8 +- regex-lite/src/lib.rs | 849 +++++- regex-lite/src/nfa.rs | 21 + regex-lite/src/string.rs | 2441 +++++++++++++--- regex-lite/src/utf8.rs | 4 +- regex-lite/tests/lib.rs | 2 - regex-syntax/README.md | 1 - regex-syntax/src/hir/literal.rs | 47 +- regex-syntax/src/hir/mod.rs | 49 +- regex-syntax/src/hir/print.rs | 31 +- regex-syntax/src/hir/translate.rs | 91 +- regex-syntax/src/lib.rs | 12 + regex-syntax/src/parser.rs | 25 + src/builders.rs | 2525 ++++++++++++++++ src/bytes.rs | 91 + src/error.rs | 11 +- src/lib.rs | 1275 +++++--- src/pattern.rs | 4 +- src/re_builder.rs | 434 --- src/re_bytes.rs | 1394 --------- src/re_set.rs | 519 ---- src/re_unicode.rs | 1406 --------- src/regex/bytes.rs | 2579 +++++++++++++++++ src/regex/mod.rs | 2 + src/regex/string.rs | 2561 ++++++++++++++++ src/regexset/bytes.rs | 710 +++++ src/regexset/mod.rs | 2 + src/regexset/string.rs | 706 +++++ test | 
8 + testdata/crazy.toml | 2 - testdata/line-terminator.toml | 11 + testdata/regression.toml | 11 + tests/lib.rs | 5 +- tests/misc.rs | 5 +- tests/regression.rs | 3 +- tests/suite_bytes.rs | 1 + tests/suite_bytes_set.rs | 1 + tests/suite_string.rs | 1 + tests/suite_string_set.rs | 1 + 108 files changed, 15243 insertions(+), 7425 deletions(-) delete mode 100644 PERFORMANCE.md delete mode 100644 examples/regexdna-input.txt delete mode 100644 examples/regexdna-output.txt delete mode 100644 examples/shootout-regex-dna-bytes.rs delete mode 100644 examples/shootout-regex-dna-cheat.rs delete mode 100644 examples/shootout-regex-dna-single-cheat.rs delete mode 100644 examples/shootout-regex-dna-single.rs delete mode 100644 examples/shootout-regex-dna.rs create mode 100644 regex-cli/README.md create mode 100644 src/builders.rs create mode 100644 src/bytes.rs delete mode 100644 src/re_builder.rs delete mode 100644 src/re_bytes.rs delete mode 100644 src/re_set.rs delete mode 100644 src/re_unicode.rs create mode 100644 src/regex/bytes.rs create mode 100644 src/regex/mod.rs create mode 100644 src/regex/string.rs create mode 100644 src/regexset/bytes.rs create mode 100644 src/regexset/mod.rs create mode 100644 src/regexset/string.rs diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 366382d6a9..25df2b3014 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -124,6 +124,8 @@ jobs: run: ${{ env.CARGO }} test --verbose --manifest-path regex-automata/Cargo.toml $TARGET - name: Run regex-lite tests run: ${{ env.CARGO }} test --verbose --manifest-path regex-lite/Cargo.toml $TARGET + - name: Run regex-cli tests + run: ${{ env.CARGO }} test --verbose --manifest-path regex-cli/Cargo.toml $TARGET # This job runs a stripped down version of CI to test the MSRV. 
The specific # reason for doing this is that the regex crate's dev-dependencies tend to diff --git a/Cargo.toml b/Cargo.toml index ce337b4b55..2527e885b0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,6 +45,15 @@ std = [ "regex-automata/std", "regex-syntax/std", ] +# This feature enables the 'log' crate to emit messages. This is usually +# only useful for folks working on the regex crate itself, but can be useful +# if you're trying hard to do some performance hacking on regex patterns +# themselves. Note that you'll need to pair this with a crate like 'env_logger' +# to actually emit the log messages somewhere. +logging = [ + "aho-corasick?/logging", + "regex-automata/logging", +] # The 'use_std' feature is DEPRECATED. It will be removed in regex 2. Until # then, it is an alias for the 'std' feature. use_std = ["std"] @@ -64,11 +73,6 @@ perf = [ "perf-inline", "perf-literal", ] -# Enables fast caching. (If disabled, caching is still used, but is slower.) -# Currently, this feature has no effect. It used to remove the thread_local -# dependency and use a slower internal cache, but now the default cache has -# been improved and thread_local is no longer a dependency at all. -perf-cache = [] # Enables use of a lazy DFA when possible. perf-dfa = ["regex-automata/hybrid"] # Enables use of a fully compiled DFA when possible. @@ -86,6 +90,11 @@ perf-literal = [ "dep:memchr", "regex-automata/perf-literal", ] +# Enables fast caching. (If disabled, caching is still used, but is slower.) +# Currently, this feature has no effect. It used to remove the thread_local +# dependency and use a slower internal cache, but now the default cache has +# been improved and thread_local is no longer a dependency at all. +perf-cache = [] # UNICODE DATA FEATURES @@ -151,7 +160,7 @@ unstable = ["pattern"] # by default if the unstable feature is enabled. pattern = [] -# For very fast prefix literal matching. +# For very fast multi-prefix literal matching. 
[dependencies.aho-corasick] version = "1.0.0" optional = true @@ -160,13 +169,6 @@ optional = true [dependencies.memchr] version = "2.5.0" optional = true - -# For parsing regular expressions. -[dependencies.regex-syntax] -path = "regex-syntax" -version = "0.7.2" -default-features = false - # For the actual regex engines. [dependencies.regex-automata] path = "regex-automata" @@ -174,9 +176,15 @@ version = "0.3.0" default-features = false features = ["alloc", "syntax", "meta", "nfa-pikevm"] +# For parsing regular expressions. +[dependencies.regex-syntax] +path = "regex-syntax" +version = "0.7.2" +default-features = false + [dev-dependencies] # For examples. -lazy_static = "1" +once_cell = "1.17.1" # For property based tests. quickcheck = { version = "1.0.3", default-features = false } # To check README's example diff --git a/PERFORMANCE.md b/PERFORMANCE.md deleted file mode 100644 index 8cd0d9c719..0000000000 --- a/PERFORMANCE.md +++ /dev/null @@ -1,277 +0,0 @@ -Your friendly guide to understanding the performance characteristics of this -crate. - -This guide assumes some familiarity with the public API of this crate, which -can be found here: https://docs.rs/regex - -## Theory vs. Practice - -One of the design goals of this crate is to provide worst case linear time -behavior with respect to the text searched using finite state automata. This -means that, *in theory*, the performance of this crate is much better than most -regex implementations, which typically use backtracking which has worst case -exponential time. - -For example, try opening a Python interpreter and typing this: - - >>> import re - >>> re.search('(a*)*c', 'a' * 30).span() - -I'll wait. - -At some point, you'll figure out that it won't terminate any time soon. ^C it. - -The promise of this crate is that *this pathological behavior can't happen*. 
- -With that said, just because we have protected ourselves against worst case -exponential behavior doesn't mean we are immune from large constant factors -or places where the current regex engine isn't quite optimal. This guide will -detail those cases and provide guidance on how to avoid them, among other -bits of general advice. - -## Thou Shalt Not Compile Regular Expressions In A Loop - -**Advice**: Use `lazy_static` to amortize the cost of `Regex` compilation. - -Don't do it unless you really don't mind paying for it. Compiling a regular -expression in this crate is quite expensive. It is conceivable that it may get -faster some day, but I wouldn't hold out hope for, say, an order of magnitude -improvement. In particular, compilation can take any where from a few dozen -microseconds to a few dozen milliseconds. Yes, milliseconds. Unicode character -classes, in particular, have the largest impact on compilation performance. At -the time of writing, for example, `\pL{100}` takes around 44ms to compile. This -is because `\pL` corresponds to every letter in Unicode and compilation must -turn it into a proper automaton that decodes a subset of UTF-8 which -corresponds to those letters. Compilation also spends some cycles shrinking the -size of the automaton. - -This means that in order to realize efficient regex matching, one must -*amortize the cost of compilation*. Trivially, if a call to `is_match` is -inside a loop, then make sure your call to `Regex::new` is *outside* that loop. - -In many programming languages, regular expressions can be conveniently defined -and compiled in a global scope, and code can reach out and use them as if -they were global static variables. 
In Rust, there is really no concept of -life-before-main, and therefore, one cannot utter this: - - static MY_REGEX: Regex = Regex::new("...").unwrap(); - -Unfortunately, this would seem to imply that one must pass `Regex` objects -around to everywhere they are used, which can be especially painful depending -on how your program is structured. Thankfully, the -[`lazy_static`](https://crates.io/crates/lazy_static) -crate provides an answer that works well: - - use lazy_static::lazy_static; - use regex::Regex; - - fn some_helper_function(text: &str) -> bool { - lazy_static! { - static ref MY_REGEX: Regex = Regex::new("...").unwrap(); - } - MY_REGEX.is_match(text) - } - -In other words, the `lazy_static!` macro enables us to define a `Regex` *as if* -it were a global static value. What is actually happening under the covers is -that the code inside the macro (i.e., `Regex::new(...)`) is run on *first use* -of `MY_REGEX` via a `Deref` impl. The implementation is admittedly magical, but -it's self contained and everything works exactly as you expect. In particular, -`MY_REGEX` can be used from multiple threads without wrapping it in an `Arc` or -a `Mutex`. On that note... - -## Using a regex from multiple threads - -**Advice**: The performance impact from using a `Regex` from multiple threads -is likely negligible. If necessary, clone the `Regex` so that each thread gets -its own copy. Cloning a regex does not incur any additional memory overhead -than what would be used by using a `Regex` from multiple threads -simultaneously. *Its only cost is ergonomics.* - -It is supported and encouraged to define your regexes using `lazy_static!` as -if they were global static values, and then use them to search text from -multiple threads simultaneously. - -One might imagine that this is possible because a `Regex` represents a -*compiled* program, so that any allocation or mutation is already done, and is -therefore read-only. Unfortunately, this is not true. 
Each type of search -strategy in this crate requires some kind of mutable scratch space to use -*during search*. For example, when executing a DFA, its states are computed -lazily and reused on subsequent searches. Those states go into that mutable -scratch space. - -The mutable scratch space is an implementation detail, and in general, its -mutation should not be observable from users of this crate. Therefore, it uses -interior mutability. This implies that `Regex` can either only be used from one -thread, or it must do some sort of synchronization. Either choice is -reasonable, but this crate chooses the latter, in particular because it is -ergonomic and makes use with `lazy_static!` straight forward. - -Synchronization implies *some* amount of overhead. When a `Regex` is used from -a single thread, this overhead is negligible. When a `Regex` is used from -multiple threads simultaneously, it is possible for the overhead of -synchronization from contention to impact performance. The specific cases where -contention may happen is if you are calling any of these methods repeatedly -from multiple threads simultaneously: - -* shortest_match -* is_match -* find -* captures - -In particular, every invocation of one of these methods must synchronize with -other threads to retrieve its mutable scratch space before searching can start. -If, however, you are using one of these methods: - -* find_iter -* captures_iter - -Then you may not suffer from contention since the cost of synchronization is -amortized on *construction of the iterator*. That is, the mutable scratch space -is obtained when the iterator is created and retained throughout its lifetime. - -## Only ask for what you need - -**Advice**: Prefer in this order: `is_match`, `find`, `captures`. - -There are three primary search methods on a `Regex`: - -* is_match -* find -* captures - -In general, these are ordered from fastest to slowest. 
- -`is_match` is fastest because it doesn't actually need to find the start or the -end of the leftmost-first match. It can quit immediately after it knows there -is a match. For example, given the regex `a+` and the haystack, `aaaaa`, the -search will quit after examining the first byte. - -In contrast, `find` must return both the start and end location of the -leftmost-first match. It can use the DFA matcher for this, but must run it -forwards once to find the end of the match *and then run it backwards* to find -the start of the match. The two scans and the cost of finding the real end of -the leftmost-first match make this more expensive than `is_match`. - -`captures` is the most expensive of them all because it must do what `find` -does, and then run either the bounded backtracker or the Pike VM to fill in the -capture group locations. Both of these are simulations of an NFA, which must -spend a lot of time shuffling states around. The DFA limits the performance hit -somewhat by restricting the amount of text that must be searched via an NFA -simulation. - -One other method not mentioned is `shortest_match`. This method has precisely -the same performance characteristics as `is_match`, except it will return the -end location of when it discovered a match. For example, given the regex `a+` -and the haystack `aaaaa`, `shortest_match` may return `1` as opposed to `5`, -the latter of which being the correct end location of the leftmost-first match. - -## Literals in your regex may make it faster - -**Advice**: Literals can reduce the work that the regex engine needs to do. Use -them if you can, especially as prefixes. - -In particular, if your regex starts with a prefix literal, the prefix is -quickly searched before entering the (much slower) regex engine. For example, -given the regex `foo\w+`, the literal `foo` will be searched for using -Boyer-Moore. If there's no match, then no regex engine is ever used. 
Only when -there's a match is the regex engine invoked at the location of the match, which -effectively permits the regex engine to skip large portions of a haystack. -If a regex is comprised entirely of literals (possibly more than one), then -it's possible that the regex engine can be avoided entirely even when there's a -match. - -When one literal is found, Boyer-Moore is used. When multiple literals are -found, then an optimized version of Aho-Corasick is used. - -This optimization is in particular extended quite a bit in this crate. Here are -a few examples of regexes that get literal prefixes detected: - -* `(foo|bar)` detects `foo` and `bar` -* `(a|b)c` detects `ac` and `bc` -* `[ab]foo[yz]` detects `afooy`, `afooz`, `bfooy` and `bfooz` -* `a?b` detects `a` and `b` -* `a*b` detects `a` and `b` -* `(ab){3,6}` detects `ababab` - -Literals in anchored regexes can also be used for detecting non-matches very -quickly. For example, `^foo\w+` and `\w+foo$` may be able to detect a non-match -just by examining the first (or last) three bytes of the haystack. - -## Unicode word boundaries may prevent the DFA from being used - -**Advice**: In most cases, `\b` should work well. If not, use `(?-u:\b)` -instead of `\b` if you care about consistent performance more than correctness. - -It's a sad state of the current implementation. At the moment, the DFA will try -to interpret Unicode word boundaries as if they were ASCII word boundaries. -If the DFA comes across any non-ASCII byte, it will quit and fall back to an -alternative matching engine that can handle Unicode word boundaries correctly. -The alternate matching engine is generally quite a bit slower (perhaps by an -order of magnitude). If necessary, this can be ameliorated in two ways. - -The first way is to add some number of literal prefixes to your regular -expression. 
Even though the DFA may not be used, specialized routines will -still kick in to find prefix literals quickly, which limits how much work the -NFA simulation will need to do. - -The second way is to give up on Unicode and use an ASCII word boundary instead. -One can use an ASCII word boundary by disabling Unicode support. That is, -instead of using `\b`, use `(?-u:\b)`. Namely, given the regex `\b.+\b`, it -can be transformed into a regex that uses the DFA with `(?-u:\b).+(?-u:\b)`. It -is important to limit the scope of disabling the `u` flag, since it might lead -to a syntax error if the regex could match arbitrary bytes. For example, if one -wrote `(?-u)\b.+\b`, then a syntax error would be returned because `.` matches -any *byte* when the Unicode flag is disabled. - -The second way isn't appreciably different than just using a Unicode word -boundary in the first place, since the DFA will speculatively interpret it as -an ASCII word boundary anyway. The key difference is that if an ASCII word -boundary is used explicitly, then the DFA won't quit in the presence of -non-ASCII UTF-8 bytes. This results in giving up correctness in exchange for -more consistent performance. - -N.B. When using `bytes::Regex`, Unicode support is disabled by default, so one -can simply write `\b` to get an ASCII word boundary. - -## Excessive counting can lead to exponential state blow up in the DFA - -**Advice**: Don't write regexes that cause DFA state blow up if you care about -match performance. - -Wait, didn't I say that this crate guards against exponential worst cases? -Well, it turns out that the process of converting an NFA to a DFA can lead to -an exponential blow up in the number of states. This crate specifically guards -against exponential blow up by doing two things: - -1. The DFA is computed lazily. That is, a state in the DFA only exists in - memory if it is visited. In particular, the lazy DFA guarantees that *at - most* one state is created for every byte of input. 
This, on its own, - guarantees linear time complexity. -2. Of course, creating a new state for *every* byte of input means that search - will go incredibly slow because of very large constant factors. On top of - that, creating a state for every byte in a large haystack could result in - exorbitant memory usage. To ameliorate this, the DFA bounds the number of - states it can store. Once it reaches its limit, it flushes its cache. This - prevents reuse of states that it already computed. If the cache is flushed - too frequently, then the DFA will give up and execution will fall back to - one of the NFA simulations. - -In effect, this crate will detect exponential state blow up and fall back to -a search routine with fixed memory requirements. This does, however, mean that -searching will be much slower than one might expect. Regexes that rely on -counting in particular are strong aggravators of this behavior. For example, -matching `[01]*1[01]{20}$` against a random sequence of `0`s and `1`s. - -In the future, it may be possible to increase the bound that the DFA uses, -which would allow the caller to choose how much memory they're willing to -spend. - -## Resist the temptation to "optimize" regexes - -**Advice**: This ain't a backtracking engine. - -An entire book was written on how to optimize Perl-style regular expressions. -Most of those techniques are not applicable for this library. For example, -there is no problem with using non-greedy matching or having lots of -alternations in your regex. diff --git a/README.md b/README.md index 020b353956..a9d6fcd373 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,17 @@ regex ===== -A Rust library for parsing, compiling, and executing regular expressions. Its -syntax is similar to Perl-style regular expressions, but lacks a few features -like look around and backreferences. In exchange, all searches execute in -linear time with respect to the size of the regular expression and search text. 
-Much of the syntax and implementation is inspired -by [RE2](https://github.com/google/re2). +This crate provides routines for searching strings for matches of a [regular +expression] (aka "regex"). The regex syntax supported by this crate is similar +to other regex engines, but it lacks several features that are not known how to +implement efficiently. This includes, but is not limited to, look-around and +backreferences. In exchange, all regex searches in this crate have worst case +`O(m * n)` time complexity, where `m` is proportional to the size of the regex +and `n` is proportional to the size of the string being searched. + +[regular expression]: https://en.wikipedia.org/wiki/Regular_expression [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex.svg)](https://crates.io/crates/regex) -[![Rust](https://img.shields.io/badge/rust-1.60.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation @@ -40,8 +42,8 @@ fn main() { - (?P\d{2}) # the day ").unwrap(); - let caps = re.captures("2010-03-14").unwrap(); + let caps = re.captures("2010-03-14").unwrap(); assert_eq!("2010", &caps["year"]); assert_eq!("03", &caps["month"]); assert_eq!("14", &caps["day"]); @@ -54,32 +56,21 @@ easy to adapt the above example with an iterator: ```rust use regex::Regex; -const TO_SEARCH: &'static str = " -On 2010-03-14, foo happened. On 2014-10-14, bar happened. -"; - fn main() { let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); + let hay = "On 2010-03-14, foo happened. On 2014-10-14, bar happened."; - for caps in re.captures_iter(TO_SEARCH) { - // Note that all of the unwraps are actually OK for this regex - // because the only way for the regex to match is if all of the - // capture groups match. This is not true in general though! 
- println!("year: {}, month: {}, day: {}", - caps.get(1).unwrap().as_str(), - caps.get(2).unwrap().as_str(), - caps.get(3).unwrap().as_str()); + let mut dates = vec![]; + for (_, [year, month, day]) in re.captures_iter(hay).map(|c| c.extract()) { + dates.push((year, month, day)); } + assert_eq!(dates, vec![ + ("2010", "03", "14"), + ("2014", "10", "14"), + ]); } ``` -This example outputs: - -```text -year: 2010, month: 03, day: 14 -year: 2014, month: 10, day: 14 -``` - ### Usage: Avoid compiling the same regex in a loop It is an anti-pattern to compile the same regular expression in a loop since @@ -90,19 +81,23 @@ allocations internally to the matching engines. In Rust, it can sometimes be a pain to pass regular expressions around if they're used from inside a helper function. Instead, we recommend using the -[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that -regular expressions are compiled exactly once. - -For example: +[`once_cell`](https://crates.io/crates/once_cell) crate to ensure that +regular expressions are compiled exactly once. For example: -```rust,ignore -use regex::Regex; +```rust +use { + once_cell::sync::Lazy, + regex::Regex, +}; + +fn some_helper_function(haystack: &str) -> bool { + static RE: Lazy = Lazy::new(|| Regex::new(r"...").unwrap()); + RE.is_match(haystack) +} -fn some_helper_function(text: &str) -> bool { - lazy_static! { - static ref RE: Regex = Regex::new("...").unwrap(); - } - RE.is_match(text) +fn main() { + assert!(some_helper_function("abc")); + assert!(!some_helper_function("ac")); } ``` @@ -115,19 +110,21 @@ The main API of this crate (`regex::Regex`) requires the caller to pass a `&str` for searching. In Rust, an `&str` is required to be valid UTF-8, which means the main API can't be used for searching arbitrary bytes. -To match on arbitrary bytes, use the `regex::bytes::Regex` API. The API -is identical to the main API, except that it takes an `&[u8]` to search -on instead of an `&str`. 
By default, `.` will match any *byte* using -`regex::bytes::Regex`, while `.` will match any *UTF-8 encoded Unicode scalar -value* using the main API. +To match on arbitrary bytes, use the `regex::bytes::Regex` API. The API is +identical to the main API, except that it takes an `&[u8]` to search on instead +of an `&str`. The `&[u8]` APIs also permit disabling Unicode mode in the regex +even when the pattern would match invalid UTF-8. For example, `(?-u:.)` is +not allowed in `regex::Regex` but is allowed in `regex::bytes::Regex` since +`(?-u:.)` matches any byte except for `\n`. Conversely, `.` will match the +UTF-8 encoding of any Unicode scalar value except for `\n`. This example shows how to find all null-terminated strings in a slice of bytes: ```rust use regex::bytes::Regex; -let re = Regex::new(r"(?P[^\x00]+)\x00").unwrap(); -let text = b"foo\x00bar\x00baz\x00"; +let re = Regex::new(r"(?-u)(?[^\x00]+)\x00").unwrap(); +let text = b"foo\xFFbar\x00baz\x00"; // Extract all of the strings without the null terminator from each match. // The unwrap is OK here since a match requires the `cstr` capture to match. @@ -135,12 +132,12 @@ let cstrs: Vec<&[u8]> = re.captures_iter(text) .map(|c| c.name("cstr").unwrap().as_bytes()) .collect(); -assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); +assert_eq!(vec![&b"foo\xFFbar"[..], &b"baz"[..]], cstrs); ``` -Notice here that the `[^\x00]+` will match any *byte* except for `NUL`. When -using the main API, `[^\x00]+` would instead match any valid UTF-8 sequence -except for `NUL`. +Notice here that the `[^\x00]+` will match any *byte* except for `NUL`, +including bytes like `\xFF` which are not valid UTF-8. When using the main API, +`[^\x00]+` would instead match any valid UTF-8 sequence except for `NUL`. 
### Usage: match multiple regular expressions simultaneously @@ -170,11 +167,15 @@ assert!(!matches.matched(5)); assert!(matches.matched(6)); ``` -### Usage: enable SIMD optimizations -SIMD optimizations are enabled automatically on Rust stable 1.27 and newer. -For nightly versions of Rust, this requires a recent version with the SIMD -features stabilized. +### Usage: regex internals as a library + +The [`regex-automata` directory](./regex-automata/) contains a crate that +exposes all of the internal matching engines used by the `regex` crate. The +idea is that the `regex` crate exposes a simple API for 99% of use cases, but +`regex-automata` exposes oodles of customizable behaviors. + +[Documentation for `regex-automata`.](https://docs.rs/regex-automata) ### Usage: a regular expression parser @@ -186,7 +187,7 @@ This may be useful if you're implementing your own regex engine or otherwise need to do analysis on the syntax of a regular expression. It is otherwise not recommended for general use. -[Documentation `regex-syntax`.](https://docs.rs/regex-syntax) +[Documentation for `regex-syntax`.](https://docs.rs/regex-syntax) ### Crate features @@ -205,29 +206,28 @@ all such features, use the following `Cargo.toml` dependency configuration: [dependencies.regex] version = "1.3" default-features = false -# regex currently requires the standard library, you must re-enable it. +# Unless you have a specific reason not to, it's good sense to enable standard +# library support. It enables several optimizations and avoids spin locks. It +# also shouldn't meaningfully impact compile times or binary size. features = ["std"] ``` -This will reduce the dependency tree of `regex` down to a single crate -(`regex-syntax`). +This will reduce the dependency tree of `regex` down to two crates: +`regex-syntax` and `regex-automata`. The full set of features one can disable are -[in the "Crate features" section of the documentation](https://docs.rs/regex/*/#crate-features). 
+[in the "Crate features" section of the documentation](https://docs.rs/regex/1.*/#crate-features). ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.60.0`. -The current **tentative** policy is that the minimum Rust version required -to use this crate can be increased in minor version updates. For example, if -regex 1.0 requires Rust 1.20.0, then regex 1.0.z for all values of `z` will -also require Rust 1.20.0 or newer. However, regex 1.y for `y > 0` may require a -newer minimum version of Rust. - -In general, this crate will be conservative with respect to the minimum -supported version of Rust. +The policy is that the minimum Rust version required to use this crate can be +increased in minor version updates. For example, if regex 1.0 requires Rust +1.20.0, then regex 1.0.z for all values of `z` will also require Rust 1.20.0 or +newer. However, regex 1.y for `y > 0` may require a newer minimum version of +Rust. ### License diff --git a/UNICODE.md b/UNICODE.md index df7d21ed97..60db0aad1f 100644 --- a/UNICODE.md +++ b/UNICODE.md @@ -8,7 +8,8 @@ Full support for Level 1 ("Basic Unicode Support") is provided with two exceptions: 1. Line boundaries are not Unicode aware. Namely, only the `\n` - (`END OF LINE`) character is recognized as a line boundary. + (`END OF LINE`) character is recognized as a line boundary by default. + One can opt into `\r\n|\r|\n` being a line boundary via CRLF mode. 2. The compatibility properties specified by [RL1.2a](https://unicode.org/reports/tr18/#RL1.2a) are ASCII-only definitions. @@ -229,12 +230,10 @@ then all characters classes are case folded as well. [UTS#18 RL1.6](https://unicode.org/reports/tr18/#Line_Boundaries) The regex crate only provides support for recognizing the `\n` (`END OF LINE`) -character as a line boundary. This choice was made mostly for implementation -convenience, and to avoid performance cliffs that Unicode word boundaries are -subject to. 
- -Ideally, it would be nice to at least support `\r\n` as a line boundary as -well, and in theory, this could be done efficiently. +character as a line boundary by default. One can also opt into treating +`\r\n|\r|\n` as a line boundary via CRLF mode. This choice was made mostly for +implementation convenience, and to avoid performance cliffs that Unicode word +boundaries are subject to. ## RL1.7 Code Points diff --git a/examples/regexdna-input.txt b/examples/regexdna-input.txt deleted file mode 100644 index fb23263397..0000000000 --- a/examples/regexdna-input.txt +++ /dev/null @@ -1,1671 +0,0 @@ ->ONE Homo sapiens alu -GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA -TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT -AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG -GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG -CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT -GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA -GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA -TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG -AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA -GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT -AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC -AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG -GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC -CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG -AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT -TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA -TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT -GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG -TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT -CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG -CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG -TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA 
-CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG -AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG -GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC -TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA -TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA -GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT -GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC -ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT -TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC -CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG -CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG -GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC -CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT -GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC -GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA -GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA -GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA -GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG -AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT -CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA -GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA -AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC -GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT -ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG -GAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATC -GCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGC -GGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGG -TCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAA -AAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAG -GAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACT -CCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCC -TGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAG 
-ACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGC -GTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGA -ACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGA -CAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCA -CTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCA -ACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCG -CCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGG -AGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTC -CGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCG -AGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACC -CCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAG -CTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAG -CCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGG -CCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATC -ACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAA -AAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGC -TGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCC -ACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGG -CTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGG -AGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATT -AGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAA -TCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGC -CTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAA -TCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAG -CCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGT -GGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCG -GGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAG -CGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTG -GGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATG -GTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGT -AATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTT -GCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCT 
-CAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCG -GGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTC -TCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACT -CGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAG -ATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGG -CGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTG -AGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATA -CAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGG -CAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGC -ACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCAC -GCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTC -GAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCG -GGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCT -TGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGG -CGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCA -GCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGG -CCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGC -GCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGG -CGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGA -CTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGG -CCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAA -ACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCC -CAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGT -GAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAA -AGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGG -ATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTAC -TAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGA -GGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGC -GCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGG -TGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTC -AGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAA -ATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGA 
-GAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCC -AGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTG -TAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGAC -CAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGT -GGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAAC -CCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACA -GAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACT -TTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAAC -ATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCC -TGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAG -GTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCG -TCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAG -GCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCC -GTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCT -ACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCC -GAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCC -GGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCAC -CTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAA -ATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTG -AGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCAC -TGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCT -CACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAG -TTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAG -CCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATC -GCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCT -GGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATC -CCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCC -TGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGG -CGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGG -AGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCG -AGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGG -AGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGT 
-GAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAA -TCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGC -AGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCA -AAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGG -CGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTC -TACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCG -GGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGAT -CGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCG -CGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAG -GTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACA -AAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCA -GGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCAC -TCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGC -CTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGA -GACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGG -CGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTG -AACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCG -ACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGC -ACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCC -AACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGC -GCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCG -GAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACT -CCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCC -GAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAAC -CCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCA -GCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGA -GCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAG -GCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGAT -CACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTA -AAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGG -CTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGC -CACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTG 
-GCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAG -GAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAAT -TAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGA -ATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAG -CCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTA -ATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCA -GCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGG -TGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCC -GGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGA -GCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTT -GGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACAT -GGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTG -TAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGT -TGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTC -TCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGC -GGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGT -CTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTAC -TCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGA -GATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGG -GCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCT -GAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAAT -ACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAG -GCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTG -CACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCA -CGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTT -CGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCC -GGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGC -TTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGG -GCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCC -AGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTG -GCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCG -CGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAG 
-GCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAG -ACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAG -GCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGA -AACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATC -CCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAG -TGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAA -AAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCG -GATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTA -CTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGG -AGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCG -CGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCG -GTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGT -CAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAA -AATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGG -AGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTC -CAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCT -GTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGA -CCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCG -TGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAA -CCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGAC -AGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCAC -TTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAA -CATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGC -CTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGA -GGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCC -GTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGA -GGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCC -CGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGC -TACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGC -CGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGC -CGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCA -CCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAA 
-AATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCT -GAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCA -CTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGC -TCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGA -GTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTA -GCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAAT -CGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCC -TGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAAT -CCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGC -CTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTG -GCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGG -GAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGC -GAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGG -GAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGG -TGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTA -ATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTG -CAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTC -AAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGG -GCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCT -CTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTC -GGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGA -TCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGC -GCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGA -GGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATAC -AAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGC -AGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCA -CTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACG -CCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCG -AGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGG -GCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTT -GAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGC -GACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAG 
-CACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGC -CAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCG -CGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGC -GGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGAC -TCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGC -CGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAA -CCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCC -AGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTG -AGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAA -GGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGA -TCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACT -AAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAG -GCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCG -CCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGT -GGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCA -GGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAA -TTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAG -AATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCA -GCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGT -AATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACC -AGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTG -GTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACC -CGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAG -AGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTT -TGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACA -TGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCT -GTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGG -TTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGT -CTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGG -CGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCG -TCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTA -CTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCG 
-AGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCG -GGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACC -TGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAA -TACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGA -GGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACT -GCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTC -ACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGT -TCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGC -CGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCG -CTTGAACCCGGGAGGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTG -GGCGACAGAGCGAGACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCC -CAGCACTTTGGGAGGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCT -GGCCAACATGGTGAAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGC -GCGCGCCTGTAATCCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGA -GGCGGAGGTTGCAGTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGA -GACTCCGTCTCAAAAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGA -GGCCGAGGCGGGCGGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTG -AAACCCCGTCTCTACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAAT -CCCAGCTACTCGGGAGGCTGAGGCAGGAGAATCGCTTGAACCCGGGAGGCGGAGGTTGCA -GTGAGCCGAGATCGCGCCACTGCACTCCAGCCTGGGCGACAGAGCGAGACTCCGTCTCAA -AAAGGCCGGGCGCGGTGGCTCACGCCTGTAATCCCAGCACTTTGGGAGGCCGAGGCGGGC -GGATCACCTGAGGTCAGGAGTTCGAGACCAGCCTGGCCAACATGGTGAAACCCCGTCTCT -ACTAAAAATACAAAAATTAGCCGGGCGTGGTGGCGCGCGCCTGTAATCCCAGCTACTCGG -GAGGCTGAGGCAGGAGAATC ->TWO IUB ambiguity codes -cttBtatcatatgctaKggNcataaaSatgtaaaDcDRtBggDtctttataattcBgtcg -tactDtDagcctatttSVHtHttKtgtHMaSattgWaHKHttttagacatWatgtRgaaa -NtactMcSMtYtcMgRtacttctWBacgaaatatagScDtttgaagacacatagtVgYgt -cattHWtMMWcStgttaggKtSgaYaaccWStcgBttgcgaMttBYatcWtgacaYcaga -gtaBDtRacttttcWatMttDBcatWtatcttactaBgaYtcttgttttttttYaaScYa -HgtgttNtSatcMtcVaaaStccRcctDaataataStcYtRDSaMtDttgttSagtRRca -tttHatSttMtWgtcgtatSSagactYaaattcaMtWatttaSgYttaRgKaRtccactt -tattRggaMcDaWaWagttttgacatgttctacaaaRaatataataaMttcgDacgaSSt 
-acaStYRctVaNMtMgtaggcKatcttttattaaaaagVWaHKYagtttttatttaacct -tacgtVtcVaattVMBcttaMtttaStgacttagattWWacVtgWYagWVRctDattBYt -gtttaagaagattattgacVatMaacattVctgtBSgaVtgWWggaKHaatKWcBScSWa -accRVacacaaactaccScattRatatKVtactatatttHttaagtttSKtRtacaaagt -RDttcaaaaWgcacatWaDgtDKacgaacaattacaRNWaatHtttStgttattaaMtgt -tgDcgtMgcatBtgcttcgcgaDWgagctgcgaggggVtaaScNatttacttaatgacag -cccccacatYScaMgtaggtYaNgttctgaMaacNaMRaacaaacaKctacatagYWctg -ttWaaataaaataRattagHacacaagcgKatacBttRttaagtatttccgatctHSaat -actcNttMaagtattMtgRtgaMgcataatHcMtaBSaRattagttgatHtMttaaKagg -YtaaBataSaVatactWtataVWgKgttaaaacagtgcgRatatacatVtHRtVYataSa -KtWaStVcNKHKttactatccctcatgWHatWaRcttactaggatctataDtDHBttata -aaaHgtacVtagaYttYaKcctattcttcttaataNDaaggaaaDYgcggctaaWSctBa -aNtgctggMBaKctaMVKagBaactaWaDaMaccYVtNtaHtVWtKgRtcaaNtYaNacg -gtttNattgVtttctgtBaWgtaattcaagtcaVWtactNggattctttaYtaaagccgc -tcttagHVggaYtgtNcDaVagctctctKgacgtatagYcctRYHDtgBattDaaDgccK -tcHaaStttMcctagtattgcRgWBaVatHaaaataYtgtttagMDMRtaataaggatMt -ttctWgtNtgtgaaaaMaatatRtttMtDgHHtgtcattttcWattRSHcVagaagtacg -ggtaKVattKYagactNaatgtttgKMMgYNtcccgSKttctaStatatNVataYHgtNa -BKRgNacaactgatttcctttaNcgatttctctataScaHtataRagtcRVttacDSDtt -aRtSatacHgtSKacYagttMHtWataggatgactNtatSaNctataVtttRNKtgRacc -tttYtatgttactttttcctttaaacatacaHactMacacggtWataMtBVacRaSaatc -cgtaBVttccagccBcttaRKtgtgcctttttRtgtcagcRttKtaaacKtaaatctcac -aattgcaNtSBaaccgggttattaaBcKatDagttactcttcattVtttHaaggctKKga -tacatcBggScagtVcacattttgaHaDSgHatRMaHWggtatatRgccDttcgtatcga -aacaHtaagttaRatgaVacttagattVKtaaYttaaatcaNatccRttRRaMScNaaaD -gttVHWgtcHaaHgacVaWtgttScactaagSgttatcttagggDtaccagWattWtRtg -ttHWHacgattBtgVcaYatcggttgagKcWtKKcaVtgaYgWctgYggVctgtHgaNcV -taBtWaaYatcDRaaRtSctgaHaYRttagatMatgcatttNattaDttaattgttctaa -ccctcccctagaWBtttHtBccttagaVaatMcBHagaVcWcagBVttcBtaYMccagat -gaaaaHctctaacgttagNWRtcggattNatcRaNHttcagtKttttgWatWttcSaNgg -gaWtactKKMaacatKatacNattgctWtatctaVgagctatgtRaHtYcWcttagccaa -tYttWttaWSSttaHcaaaaagVacVgtaVaRMgattaVcDactttcHHggHRtgNcctt 
-tYatcatKgctcctctatVcaaaaKaaaagtatatctgMtWtaaaacaStttMtcgactt -taSatcgDataaactaaacaagtaaVctaggaSccaatMVtaaSKNVattttgHccatca -cBVctgcaVatVttRtactgtVcaattHgtaaattaaattttYtatattaaRSgYtgBag -aHSBDgtagcacRHtYcBgtcacttacactaYcgctWtattgSHtSatcataaatataHt -cgtYaaMNgBaatttaRgaMaatatttBtttaaaHHKaatctgatWatYaacttMctctt -ttVctagctDaaagtaVaKaKRtaacBgtatccaaccactHHaagaagaaggaNaaatBW -attccgStaMSaMatBttgcatgRSacgttVVtaaDMtcSgVatWcaSatcttttVatag -ttactttacgatcaccNtaDVgSRcgVcgtgaacgaNtaNatatagtHtMgtHcMtagaa -attBgtataRaaaacaYKgtRccYtatgaagtaataKgtaaMttgaaRVatgcagaKStc -tHNaaatctBBtcttaYaBWHgtVtgacagcaRcataWctcaBcYacYgatDgtDHccta -aagacYRcaggattHaYgtKtaatgcVcaataMYacccatatcacgWDBtgaatcBaata -cKcttRaRtgatgaBDacggtaattaaYtataStgVHDtDctgactcaaatKtacaatgc -gYatBtRaDatHaactgtttatatDttttaaaKVccYcaaccNcBcgHaaVcattHctcg -attaaatBtatgcaaaaatYMctSactHatacgaWacattacMBgHttcgaatVaaaaca -BatatVtctgaaaaWtctRacgBMaatSgRgtgtcgactatcRtattaScctaStagKga -DcWgtYtDDWKRgRtHatRtggtcgaHgggcgtattaMgtcagccaBggWVcWctVaaat -tcgNaatcKWagcNaHtgaaaSaaagctcYctttRVtaaaatNtataaccKtaRgtttaM -tgtKaBtRtNaggaSattHatatWactcagtgtactaKctatttgRYYatKatgtccgtR -tttttatttaatatVgKtttgtatgtNtataRatWYNgtRtHggtaaKaYtKSDcatcKg -taaYatcSRctaVtSMWtVtRWHatttagataDtVggacagVcgKWagBgatBtaaagNc -aRtagcataBggactaacacRctKgttaatcctHgDgttKHHagttgttaatgHBtatHc -DaagtVaBaRccctVgtgDtacRHSctaagagcggWYaBtSaKtHBtaaactYacgNKBa -VYgtaacttagtVttcttaatgtBtatMtMtttaattaatBWccatRtttcatagVgMMt -agctStKctaMactacDNYgKYHgaWcgaHgagattacVgtttgtRaSttaWaVgataat -gtgtYtaStattattMtNgWtgttKaccaatagNYttattcgtatHcWtctaaaNVYKKt -tWtggcDtcgaagtNcagatacgcattaagaccWctgcagcttggNSgaNcHggatgtVt -catNtRaaBNcHVagagaaBtaaSggDaatWaatRccaVgggStctDaacataKttKatt -tggacYtattcSatcttagcaatgaVBMcttDattctYaaRgatgcattttNgVHtKcYR -aatRKctgtaaacRatVSagctgtWacBtKVatctgttttKcgtctaaDcaagtatcSat -aWVgcKKataWaYttcccSaatgaaaacccWgcRctWatNcWtBRttYaattataaNgac -acaatagtttVNtataNaYtaatRaVWKtBatKagtaatataDaNaaaaataMtaagaaS -tccBcaatNgaataWtHaNactgtcDtRcYaaVaaaaaDgtttRatctatgHtgttKtga 
-aNSgatactttcgagWaaatctKaaDaRttgtggKKagcDgataaattgSaacWaVtaNM -acKtcaDaaatttctRaaVcagNacaScRBatatctRatcctaNatWgRtcDcSaWSgtt -RtKaRtMtKaatgttBHcYaaBtgatSgaSWaScMgatNtctcctatttctYtatMatMt -RRtSaattaMtagaaaaStcgVgRttSVaScagtgDtttatcatcatacRcatatDctta -tcatVRtttataaHtattcYtcaaaatactttgVctagtaaYttagatagtSYacKaaac -gaaKtaaatagataatSatatgaaatSgKtaatVtttatcctgKHaatHattagaaccgt -YaaHactRcggSBNgtgctaaBagBttgtRttaaattYtVRaaaattgtaatVatttctc -ttcatgBcVgtgKgaHaaatattYatagWacNctgaaMcgaattStagWaSgtaaKagtt -ttaagaDgatKcctgtaHtcatggKttVDatcaaggtYcgccagNgtgcVttttagagat -gctaccacggggtNttttaSHaNtatNcctcatSaaVgtactgBHtagcaYggYVKNgta -KBcRttgaWatgaatVtagtcgattYgatgtaatttacDacSctgctaaaStttaWMagD -aaatcaVYctccgggcgaVtaaWtStaKMgDtttcaaMtVgBaatccagNaaatcYRMBg -gttWtaaScKttMWtYataRaDBMaDataatHBcacDaaKDactaMgagttDattaHatH -taYatDtattDcRNStgaatattSDttggtattaaNSYacttcDMgYgBatWtaMagact -VWttctttgYMaYaacRgHWaattgRtaagcattctMKVStatactacHVtatgatcBtV -NataaBttYtSttacKgggWgYDtgaVtYgatDaacattYgatggtRDaVDttNactaSa -MtgNttaacaaSaBStcDctaccacagacgcaHatMataWKYtaYattMcaMtgSttDag -cHacgatcaHttYaKHggagttccgatYcaatgatRaVRcaagatcagtatggScctata -ttaNtagcgacgtgKaaWaactSgagtMYtcttccaKtStaacggMtaagNttattatcg -tctaRcactctctDtaacWYtgaYaSaagaWtNtatttRacatgNaatgttattgWDDcN -aHcctgaaHacSgaataaRaataMHttatMtgaSDSKatatHHaNtacagtccaYatWtc -actaactatKDacSaStcggataHgYatagKtaatKagStaNgtatactatggRHacttg -tattatgtDVagDVaRctacMYattDgtttYgtctatggtKaRSttRccRtaaccttaga -gRatagSaaMaacgcaNtatgaaatcaRaagataatagatactcHaaYKBctccaagaRa -BaStNagataggcgaatgaMtagaatgtcaKttaaatgtaWcaBttaatRcggtgNcaca -aKtttScRtWtgcatagtttWYaagBttDKgcctttatMggNttattBtctagVtacata -aaYttacacaaRttcYtWttgHcaYYtaMgBaBatctNgcDtNttacgacDcgataaSat -YaSttWtcctatKaatgcagHaVaacgctgcatDtgttaSataaaaYSNttatagtaNYt -aDaaaNtggggacttaBggcHgcgtNtaaMcctggtVtaKcgNacNtatVaSWctWtgaW -cggNaBagctctgaYataMgaagatBSttctatacttgtgtKtaattttRagtDtacata -tatatgatNHVgBMtKtaKaNttDHaagatactHaccHtcatttaaagttVaMcNgHata -tKtaNtgYMccttatcaaNagctggacStttcNtggcaVtattactHaSttatgNMVatt 
-MMDtMactattattgWMSgtHBttStStgatatRaDaagattttctatMtaaaaaggtac -taaVttaSacNaatactgMttgacHaHRttgMacaaaatagttaatatWKRgacDgaRta -tatttattatcYttaWtgtBRtWatgHaaattHataagtVaDtWaVaWtgStcgtMSgaS -RgMKtaaataVacataatgtaSaatttagtcgaaHtaKaatgcacatcggRaggSKctDc -agtcSttcccStYtccRtctctYtcaaKcgagtaMttttcRaYDttgttatctaatcata -NctctgctatcaMatactataggDaHaaSttMtaDtcNatataattctMcStaaBYtaNa -gatgtaatHagagSttgWHVcttatKaYgDctcttggtgttMcRaVgSgggtagacaata -aDtaattSaDaNaHaBctattgNtaccaaRgaVtKNtaaYggHtaKKgHcatctWtctDt -ttctttggSDtNtaStagttataaacaattgcaBaBWggHgcaaaBtYgctaatgaaatW -cDcttHtcMtWWattBHatcatcaaatctKMagtDNatttWaBtHaaaNgMttaaStagt -tctctaatDtcRVaYttgttMtRtgtcaSaaYVgSWDRtaatagctcagDgcWWaaaBaa -RaBctgVgggNgDWStNaNBKcBctaaKtttDcttBaaggBttgaccatgaaaNgttttt -tttatctatgttataccaaDRaaSagtaVtDtcaWatBtacattaWacttaSgtattggD -gKaaatScaattacgWcagKHaaccaYcRcaRttaDttRtttHgaHVggcttBaRgtccc -tDatKaVtKtcRgYtaKttacgtatBtStaagcaattaagaRgBagSaattccSWYttta -ttVaataNctgHgttaaNBgcVYgtRtcccagWNaaaacaDNaBcaaaaRVtcWMgBagM -tttattacgDacttBtactatcattggaaatVccggttRttcatagttVYcatYaSHaHc -ttaaagcNWaHataaaRWtctVtRYtagHtaaaYMataHYtNBctNtKaatattStgaMc -BtRgctaKtgcScSttDgYatcVtggaaKtaagatWccHccgKYctaNNctacaWctttt -gcRtgtVcgaKttcMRHgctaHtVaataaDtatgKDcttatBtDttggNtacttttMtga -acRattaaNagaactcaaaBBVtcDtcgaStaDctgaaaSgttMaDtcgttcaccaaaag -gWtcKcgSMtcDtatgtttStaaBtatagDcatYatWtaaaBacaKgcaDatgRggaaYc -taRtccagattDaWtttggacBaVcHtHtaacDacYgtaatataMagaatgHMatcttat -acgtatttttatattacHactgttataMgStYaattYaccaattgagtcaaattaYtgta -tcatgMcaDcgggtcttDtKgcatgWRtataatatRacacNRBttcHtBgcRttgtgcgt -catacMtttBctatctBaatcattMttMYgattaaVYatgDaatVagtattDacaacDMa -tcMtHcccataagatgBggaccattVWtRtSacatgctcaaggggYtttDtaaNgNtaaB -atggaatgtctRtaBgBtcNYatatNRtagaacMgagSaSDDSaDcctRagtVWSHtVSR -ggaacaBVaccgtttaStagaacaMtactccagtttVctaaRaaHttNcttagcaattta -ttaatRtaaaatctaacDaBttggSagagctacHtaaRWgattcaaBtctRtSHaNtgta -cattVcaHaNaagtataccacaWtaRtaaVKgMYaWgttaKggKMtKcgWatcaDatYtK -SttgtacgaccNctSaattcDcatcttcaaaDKttacHtggttHggRRaRcaWacaMtBW 
-VHSHgaaMcKattgtaRWttScNattBBatYtaNRgcggaagacHSaattRtttcYgacc -BRccMacccKgatgaacttcgDgHcaaaaaRtatatDtatYVtttttHgSHaSaatagct -NYtaHYaVYttattNtttgaaaYtaKttWtctaNtgagaaaNctNDctaaHgttagDcRt -tatagccBaacgcaRBtRctRtggtaMYYttWtgataatcgaataattattataVaaaaa -ttacNRVYcaaMacNatRttcKatMctgaagactaattataaYgcKcaSYaatMNctcaa -cgtgatttttBacNtgatDccaattattKWWcattttatatatgatBcDtaaaagttgaa -VtaHtaHHtBtataRBgtgDtaataMttRtDgDcttattNtggtctatctaaBcatctaR -atgNacWtaatgaagtcMNaacNgHttatactaWgcNtaStaRgttaaHacccgaYStac -aaaatWggaYaWgaattattcMaactcBKaaaRVNcaNRDcYcgaBctKaacaaaaaSgc -tccYBBHYaVagaatagaaaacagYtctVccaMtcgtttVatcaatttDRtgWctagtac -RttMctgtDctttcKtWttttataaatgVttgBKtgtKWDaWagMtaaagaaattDVtag -gttacatcatttatgtcgMHaVcttaBtVRtcgtaYgBRHatttHgaBcKaYWaatcNSc -tagtaaaaatttacaatcactSWacgtaatgKttWattagttttNaggtctcaagtcact -attcttctaagKggaataMgtttcataagataaaaatagattatDgcBVHWgaBKttDgc -atRHaagcaYcRaattattatgtMatatattgHDtcaDtcaaaHctStattaatHaccga -cNattgatatattttgtgtDtRatagSacaMtcRtcattcccgacacSattgttKaWatt -NHcaacttccgtttSRtgtctgDcgctcaaMagVtBctBMcMcWtgtaacgactctcttR -ggRKSttgYtYatDccagttDgaKccacgVatWcataVaaagaataMgtgataaKYaaat -cHDaacgataYctRtcYatcgcaMgtNttaBttttgatttaRtStgcaacaaaataccVg -aaDgtVgDcStctatatttattaaaaRKDatagaaagaKaaYYcaYSgKStctccSttac -agtcNactttDVttagaaagMHttRaNcSaRaMgBttattggtttaRMggatggcKDgWR -tNaataataWKKacttcKWaaagNaBttaBatMHtccattaacttccccYtcBcYRtaga -ttaagctaaYBDttaNtgaaaccHcaRMtKtaaHMcNBttaNaNcVcgVttWNtDaBatg -ataaVtcWKcttRggWatcattgaRagHgaattNtatttctctattaattaatgaDaaMa -tacgttgggcHaYVaaNaDDttHtcaaHtcVVDgBVagcMacgtgttaaBRNtatRtcag -taagaggtttaagacaVaaggttaWatctccgtVtaDtcDatttccVatgtacNtttccg -tHttatKgScBatgtVgHtYcWagcaKtaMYaaHgtaattaSaHcgcagtWNaatNccNN -YcacgVaagaRacttctcattcccRtgtgtaattagcSttaaStWaMtctNNcSMacatt -ataaactaDgtatWgtagtttaagaaaattgtagtNagtcaataaatttgatMMYactaa -tatcggBWDtVcYttcDHtVttatacYaRgaMaacaStaatcRttttVtagaDtcacWat -ttWtgaaaagaaagNRacDtttStVatBaDNtaactatatcBSMcccaSttccggaMatg -attaaWatKMaBaBatttgataNctgttKtVaagtcagScgaaaDggaWgtgttttKtWt 
-atttHaatgtagttcactaaKMagttSYBtKtaYgaactcagagRtatagtVtatcaaaW -YagcgNtaDagtacNSaaYDgatBgtcgataacYDtaaactacagWDcYKaagtttatta -gcatcgagttKcatDaattgattatDtcagRtWSKtcgNtMaaaaacaMttKcaWcaaSV -MaaaccagMVtaMaDtMaHaBgaacataBBVtaatVYaNSWcSgNtDNaaKacacBttta -tKtgtttcaaHaMctcagtaacgtcgYtactDcgcctaNgagagcYgatattttaaattt -ccattttacatttDaaRctattttWctttacgtDatYtttcagacgcaaVttagtaaKaa -aRtgVtccataBggacttatttgtttaWNtgttVWtaWNVDaattgtatttBaagcBtaa -BttaaVatcHcaVgacattccNggtcgacKttaaaRtagRtctWagaYggtgMtataatM -tgaaRttattttgWcttNtDRRgMDKacagaaaaggaaaRStcccagtYccVattaNaaK -StNWtgacaVtagaagcttSaaDtcacaacgDYacWDYtgtttKatcVtgcMaDaSKStV -cgtagaaWaKaagtttcHaHgMgMtctataagBtKaaaKKcactggagRRttaagaBaaN -atVVcgRcKSttDaactagtSttSattgttgaaRYatggttVttaataaHttccaagDtg -atNWtaagHtgcYtaactRgcaatgMgtgtRaatRaNaacHKtagactactggaatttcg -ccataacgMctRgatgttaccctaHgtgWaYcactcacYaattcttaBtgacttaaacct -gYgaWatgBttcttVttcgttWttMcNYgtaaaatctYgMgaaattacNgaHgaacDVVM -tttggtHtctaaRgtacagacgHtVtaBMNBgattagcttaRcttacaHcRctgttcaaD -BggttKaacatgKtttYataVaNattccgMcgcgtagtRaVVaattaKaatggttRgaMc -agtatcWBttNtHagctaatctagaaNaaacaYBctatcgcVctBtgcaaagDgttVtga -HtactSNYtaaNccatgtgDacgaVtDcgKaRtacDcttgctaagggcagMDagggtBWR -tttSgccttttttaacgtcHctaVtVDtagatcaNMaVtcVacatHctDWNaataRgcgt -aVHaggtaaaaSgtttMtattDgBtctgatSgtRagagYtctSaKWaataMgattRKtaa -catttYcgtaacacattRWtBtcggtaaatMtaaacBatttctKagtcDtttgcBtKYYB -aKttctVttgttaDtgattttcttccacttgSaaacggaaaNDaattcYNNaWcgaaYat -tttMgcBtcatRtgtaaagatgaWtgaccaYBHgaatagataVVtHtttVgYBtMctaMt -cctgaDcYttgtccaaaRNtacagcMctKaaaggatttacatgtttaaWSaYaKttBtag -DacactagctMtttNaKtctttcNcSattNacttggaacaatDagtattRtgSHaataat -gccVgacccgatactatccctgtRctttgagaSgatcatatcgDcagWaaHSgctYYWta -tHttggttctttatVattatcgactaagtgtagcatVgtgHMtttgtttcgttaKattcM -atttgtttWcaaStNatgtHcaaaDtaagBaKBtRgaBgDtSagtatMtaacYaatYtVc -KatgtgcaacVaaaatactKcRgtaYtgtNgBBNcKtcttaccttKgaRaYcaNKtactt -tgagSBtgtRagaNgcaaaNcacagtVtttHWatgttaNatBgtttaatNgVtctgaata -tcaRtattcttttttttRaaKcRStctcggDgKagattaMaaaKtcaHacttaataataK 
-taRgDtKVBttttcgtKaggHHcatgttagHggttNctcgtatKKagVagRaaaggaaBt -NatttVKcRttaHctaHtcaaatgtaggHccaBataNaNaggttgcWaatctgatYcaaa -HaatWtaVgaaBttagtaagaKKtaaaKtRHatMaDBtBctagcatWtatttgWttVaaa -ScMNattRactttgtYtttaaaagtaagtMtaMaSttMBtatgaBtttaKtgaatgagYg -tNNacMtcNRacMMHcttWtgtRtctttaacaacattattcYaMagBaacYttMatcttK -cRMtgMNccattaRttNatHaHNaSaaHMacacaVaatacaKaSttHatattMtVatWga -ttttttaYctttKttHgScWaacgHtttcaVaaMgaacagNatcgttaacaaaaagtaca -HBNaattgttKtcttVttaaBtctgctacgBgcWtttcaggacacatMgacatcccagcg -gMgaVKaBattgacttaatgacacacaaaaaatRKaaBctacgtRaDcgtagcVBaacDS -BHaaaaSacatatacagacRNatcttNaaVtaaaataHattagtaaaaSWccgtatWatg -gDttaactattgcccatcttHaSgYataBttBaactattBtcHtgatcaataSttaBtat -KSHYttWggtcYtttBttaataccRgVatStaHaKagaatNtagRMNgtcttYaaSaact -cagDSgagaaYtMttDtMRVgWKWtgMaKtKaDttttgactatacataatcNtatNaHat -tVagacgYgatatatttttgtStWaaatctWaMgagaRttRatacgStgattcttaagaD -taWccaaatRcagcagaaNKagtaaDggcgccBtYtagSBMtactaaataMataBSacRM -gDgattMMgtcHtcaYDtRaDaacggttDaggcMtttatgttaNctaattaVacgaaMMt -aatDccSgtattgaRtWWaccaccgagtactMcgVNgctDctaMScatagcgtcaactat -acRacgHRttgctatttaatgaattataYKttgtaagWgtYttgcHgMtaMattWaWVta -RgcttgYgttBHtYataSccStBtgtagMgtDtggcVaaSBaatagDttgBgtctttctc -attttaNagtHKtaMWcYactVcgcgtatMVtttRacVagDaatcttgctBBcRDgcaac -KttgatSKtYtagBMagaRtcgBattHcBWcaactgatttaatttWDccatttatcgagS -KaWttataHactaHMttaatHtggaHtHagaatgtKtaaRactgtttMatacgatcaagD -gatKaDctataMggtHDtggHacctttRtatcttYattttgacttgaaSaataaatYcgB -aaaaccgNatVBttMacHaKaataagtatKgtcaagactcttaHttcggaattgttDtct -aaccHttttWaaatgaaatataaaWattccYDtKtaaaacggtgaggWVtctattagtga -ctattaagtMgtttaagcatttgSgaaatatccHaaggMaaaattttcWtatKctagDtY -tMcctagagHcactttactatacaaacattaacttaHatcVMYattYgVgtMttaaRtga -aataaDatcaHgtHHatKcDYaatcttMtNcgatYatgSaMaNtcttKcWataScKggta -tcttacgcttWaaagNatgMgHtctttNtaacVtgttcMaaRatccggggactcMtttaY -MtcWRgNctgNccKatcttgYDcMgattNYaRagatHaaHgKctcataRDttacatBatc -cattgDWttatttaWgtcggagaaaaatacaatacSNtgggtttccttacSMaagBatta -caMaNcactMttatgaRBacYcYtcaaaWtagctSaacttWgDMHgaggatgBVgcHaDt 
-ggaactttggtcNatNgtaKaBcccaNtaagttBaacagtatacDYttcctNgWgcgSMc -acatStctHatgRcNcgtacacaatRttMggaNKKggataaaSaYcMVcMgtaMaHtgat -tYMatYcggtcttcctHtcDccgtgRatcattgcgccgatatMaaYaataaYSggatagc -gcBtNtaaaScaKgttBgagVagttaKagagtatVaactaSacWactSaKatWccaKaaa -atBKgaaKtDMattttgtaaatcRctMatcaaMagMttDgVatggMaaWgttcgaWatga -aatttgRtYtattaWHKcRgctacatKttctaccaaHttRatctaYattaaWatVNccat -NgagtcKttKataStRaatatattcctRWatDctVagttYDgSBaatYgttttgtVaatt -taatagcagMatRaacttBctattgtMagagattaaactaMatVtHtaaatctRgaaaaa -aaatttWacaacaYccYDSaattMatgaccKtaBKWBattgtcaagcHKaagttMMtaat -ttcKcMagNaaKagattggMagaggtaatttYacatcWaaDgatMgKHacMacgcVaaca -DtaDatatYggttBcgtatgWgaSatttgtagaHYRVacaRtctHaaRtatgaactaata -tctSSBgggaaHMWtcaagatKgagtDaSatagttgattVRatNtctMtcSaagaSHaat -aNataataRaaRgattctttaataaagWaRHcYgcatgtWRcttgaaggaMcaataBRaa -ccagStaaacNtttcaatataYtaatatgHaDgcStcWttaacctaRgtYaRtataKtgM -ttttatgactaaaatttacYatcccRWtttHRtattaaatgtttatatttgttYaatMca -RcSVaaDatcgtaYMcatgtagacatgaaattgRtcaaYaaYtRBatKacttataccaNa -aattVaBtctggacaagKaaYaaatatWtMtatcYaaVNtcgHaactBaagKcHgtctac -aatWtaDtSgtaHcataHtactgataNctRgttMtDcDttatHtcgtacatcccaggStt -aBgtcacacWtccNMcNatMVaVgtccDYStatMaccDatggYaRKaaagataRatttHK -tSaaatDgataaacttaHgttgVBtcttVttHgDacgaKatgtatatNYataactctSat -atatattgcHRRYttStggaactHgttttYtttaWtatMcttttctatctDtagVHYgMR -BgtHttcctaatYRttKtaagatggaVRataKDctaMtKBNtMtHNtWtttYcVtattMc -gRaacMcctNSctcatttaaagDcaHtYccSgatgcaatYaaaaDcttcgtaWtaattct -cgttttScttggtaatctttYgtctaactKataHacctMctcttacHtKataacacagcN -RatgKatttttSaaatRYcgDttaMRcgaaattactMtgcgtaagcgttatBtttttaat -taagtNacatHgttcRgacKcBBtVgatKttcgaBaatactDRgtRtgaNacWtcacYtt -aaKcgttctHaKttaNaMgWgWaggtctRgaKgWttSttBtDcNtgtttacaaatYcDRt -gVtgcctattcNtctaaaDMNttttNtggctgagaVctDaacVtWccaagtaacacaNct -gaScattccDHcVBatcgatgtMtaatBgHaatDctMYgagaatgYWKcctaatNaStHa -aaKccgHgcgtYaaYtattgtStgtgcaaRtattaKatattagaWVtcaMtBagttatta -gNaWHcVgcaattttDcMtgtaRHVYtHtctgtaaaaHVtMKacatcgNaatttMatatg -ttgttactagWYtaRacgataKagYNKcattataNaRtgaacKaYgcaaYYacaNccHat 
-MatDcNgtHttRaWttagaaDcaaaaaatagggtKDtStaDaRtaVtHWKNtgtattVct -SVgRgataDaRaWataBgaagaaKtaataaYgDcaStaNgtaDaaggtattHaRaWMYaY -aWtggttHYgagVtgtgcttttcaaDKcagVcgttagacNaaWtagtaataDttctggtt -VcatcataaagtgKaaaNaMtaBBaattaatWaattgctHaVKaSgDaaVKaHtatatat -HatcatSBagNgHtatcHYMHgttDgtaHtBttWatcgtttaRaattgStKgSKNWKatc -agDtctcagatttctRtYtBatBgHHtKaWtgYBgacVVWaKtacKcDttKMaKaVcggt -gttataagaataaHaatattagtataatMHgttYgaRttagtaRtcaaVatacggtcMcg -agtaaRttacWgactKRYataaaagSattYaWgagatYagKagatgSaagKgttaatMgg -tataatgttWYttatgagaaacctNVataatHcccKtDctcctaatactggctHggaSag -gRtKHaWaattcgSatMatttagaggcYtctaMcgctcataSatatgRagacNaaDagga -VBagaYttKtacNaKgtSYtagttggaWcatcWttaatctatgaVtcgtgtMtatcaYcg -tRccaaYgDctgcMgtgtWgacWtgataacacgcgctBtgttaKtYDtatDcatcagKaV -MctaatcttgVcaaRgcRMtDcgattaHttcaNatgaatMtactacVgtRgatggaWttt -actaaKatgagSaaKggtaNtactVaYtaaKRagaacccacaMtaaMtKtatBcttgtaa -WBtMctaataaVcDaaYtcRHBtcgttNtaaHatttBNgRStVDattBatVtaagttaYa -tVattaagaBcacggtSgtVtatttaRattgatgtaHDKgcaatattKtggcctatgaWD -KRYcggattgRctatNgatacaatMNttctgtcRBYRaaaHctNYattcHtaWcaattct -BtMKtVgYataatMgYtcagcttMDataVtggRtKtgaatgccNcRttcaMtRgattaac -attRcagcctHtWMtgtDRagaKaBtgDttYaaaaKatKgatctVaaYaacWcgcatagB -VtaNtRtYRaggBaaBtgKgttacataagagcatgtRattccacttaccatRaaatgWgD -aMHaYVgVtaSctatcgKaatatattaDgacccYagtgtaYNaaatKcagtBRgagtcca -tgKgaaaccBgaagBtgSttWtacgatWHaYatcgatttRaaNRgcaNaKVacaNtDgat -tgHVaatcDaagcgtatgcNttaDataatcSataaKcaataaHWataBtttatBtcaKtK -tatagttaDgSaYctacaRatNtaWctSaatatttYaKaKtaccWtatcRagacttaYtt -VcKgSDcgagaagatccHtaattctSttatggtKYgtMaHagVaBRatttctgtRgtcta -tgggtaHKgtHacHtSYacgtacacHatacKaaBaVaccaDtatcSaataaHaagagaat -ScagactataaRttagcaaVcaHataKgDacatWccccaagcaBgagWatctaYttgaaa -tctVNcYtttWagHcgcgcDcVaaatgttKcHtNtcaatagtgtNRaactttttcaatgg -WgBcgDtgVgtttctacMtaaataaaRggaaacWaHttaRtNtgctaaRRtVBctYtVta -tDcattDtgaccYatagatYRKatNYKttNgcctagtaWtgaactaMVaacctgaStttc -tgaKVtaaVaRKDttVtVctaDNtataaaDtccccaagtWtcgatcactDgYaBcatcct -MtVtacDaaBtYtMaKNatNtcaNacgDatYcatcgcaRatWBgaacWttKttagYtaat 
-tcggttgSWttttDWctttacYtatatWtcatDtMgtBttgRtVDggttaacYtacgtac -atgaattgaaWcttMStaDgtatattgaDtcRBcattSgaaVBRgagccaaKtttcDgcg -aSMtatgWattaKttWtgDBMaggBBttBaatWttRtgcNtHcgttttHtKtcWtagHSt -aacagttgatatBtaWSaWggtaataaMttaKacDaatactcBttcaatatHttcBaaSa -aatYggtaRtatNtHcaatcaHtagVtgtattataNggaMtcttHtNagctaaaggtaga -YctMattNaMVNtcKtactBKcaHHcBttaSagaKacataYgctaKaYgttYcgacWVtt -WtSagcaacatcccHaccKtcttaacgaKttcacKtNtacHtatatRtaaatacactaBt -ttgaHaRttggttWtatYagcatYDatcggagagcWBataagRtacctataRKgtBgatg -aDatataSttagBaHtaatNtaDWcWtgtaattacagKttcNtMagtattaNgtctcgtc -ctcttBaHaKcKccgtRcaaYagSattaagtKataDatatatagtcDtaacaWHcaKttD -gaaRcgtgYttgtcatatNtatttttatggccHtgDtYHtWgttatYaacaattcaWtat -NgctcaaaSttRgctaatcaaatNatcgtttaBtNNVtgttataagcaaagattBacgtD -atttNatttaaaDcBgtaSKgacgtagataatttcHMVNttgttBtDtgtaWKaaRMcKM -tHtaVtagataWctccNNaSWtVaHatctcMgggDgtNHtDaDttatatVWttgttattt -aacctttcacaaggaSaDcggttttttatatVtctgVtaacaStDVaKactaMtttaSNa -gtgaaattaNacttSKctattcctctaSagKcaVttaagNaVcttaVaaRNaHaaHttat -gtHttgtgatMccaggtaDcgaccgtWgtWMtttaHcRtattgScctatttKtaaccaag -tYagaHgtWcHaatgccKNRtttagtMYSgaDatctgtgaWDtccMNcgHgcaaacNDaa -aRaStDWtcaaaaHKtaNBctagBtgtattaactaattttVctagaatggcWSatMaccc -ttHttaSgSgtgMRcatRVKtatctgaaaccDNatYgaaVHNgatMgHRtacttaaaRta -tStRtDtatDttYatattHggaBcttHgcgattgaKcKtttcRataMtcgaVttWacatN -catacctRataDDatVaWNcggttgaHtgtMacVtttaBHtgagVttMaataattatgtt -cttagtttgtgcDtSatttgBtcaacHattaaBagVWcgcaSYttMgcttacYKtVtatc -aYaKctgBatgcgggcYcaaaaacgNtctagKBtattatctttKtaVttatagtaYtRag -NtaYataaVtgaatatcHgcaaRataHtacacatgtaNtgtcgYatWMatttgaactacR -ctaWtWtatacaatctBatatgYtaagtatgtgtatSttactVatcttYtaBcKgRaSgg -RaaaaatgcagtaaaWgtaRgcgataatcBaataccgtatttttccatcNHtatWYgatH -SaaaDHttgctgtccHtggggcctaataatttttctatattYWtcattBtgBRcVttaVM -RSgctaatMagtYtttaaaaatBRtcBttcaaVtaacagctccSaaSttKNtHtKYcagc -agaaaccccRtttttaaDcDtaStatccaagcgctHtatcttaDRYgatDHtWcaaaBcW -gKWHttHataagHacgMNKttMKHccaYcatMVaacgttaKgYcaVaaBtacgcaacttt -MctaaHaatgtBatgagaSatgtatgSRgHgWaVWgataaatatttccKagVgataattW 
-aHNcYggaaatgctHtKtaDtctaaagtMaatVDVactWtSaaWaaMtaHtaSKtcBRaN -cttStggtBttacNagcatagRgtKtgcgaacaacBcgKaatgataagatgaaaattgta -ctgcgggtccHHWHaaNacaBttNKtKtcaaBatatgctaHNgtKcDWgtttatNgVDHg -accaacWctKaaggHttgaRgYaatHcaBacaatgagcaaattactgtaVaaYaDtagat -tgagNKggtggtgKtWKaatacagDRtatRaMRtgattDggtcaaYRtatttNtagaDtc -acaaSDctDtataatcgtactaHttatacaatYaacaaHttHatHtgcgatRRttNgcat -SVtacWWgaaggagtatVMaVaaattScDDKNcaYBYaDatHgtctatBagcaacaagaa -tgagaaRcataaKNaRtBDatcaaacgcattttttaaBtcSgtacaRggatgtMNaattg -gatatWtgagtattaaaVctgcaYMtatgatttttYgaHtgtcttaagWBttHttgtctt -attDtcgtatWtataataSgctaHagcDVcNtaatcaagtaBDaWaDgtttagYctaNcc -DtaKtaHcttaataacccaRKtacaVaatNgcWRaMgaattatgaBaaagattVYaHMDc -aDHtcRcgYtcttaaaWaaaVKgatacRtttRRKYgaatacaWVacVcRtatMacaBtac -tggMataaattttHggNagSctacHgtBagcgtcgtgattNtttgatSaaggMttctttc -ttNtYNagBtaaacaaatttMgaccttacataattgYtcgacBtVMctgStgMDtagtaR -ctHtatgttcatatVRNWataDKatWcgaaaaagttaaaagcacgHNacgtaatctttMR -tgacttttDacctataaacgaaatatgattagaactccSYtaBctttaataacWgaaaYa -tagatgWttcatKtNgatttttcaagHtaYgaaRaDaagtaggagcttatVtagtctttc -attaaaatcgKtattaRttacagVaDatgcatVgattgggtctttHVtagKaaRBtaHta -aggccccaaaaKatggtttaMWgtBtaaacttcactttKHtcgatctccctaYaBacMgt -cttBaBaNgcgaaacaatctagtHccHtKttcRtRVttccVctttcatacYagMVtMcag -aMaaacaataBctgYtaatRaaagattaaccatVRatHtaRagcgcaBcgDttStttttc -VtttaDtKgcaaWaaaaatSccMcVatgtKgtaKgcgatatgtagtSaaaDttatacaaa -catYaRRcVRHctKtcgacKttaaVctaDaatgttMggRcWaacttttHaDaKaDaBctg -taggcgtttaHBccatccattcNHtDaYtaataMttacggctNVaacDattgatatttta -cVttSaattacaaRtataNDgacVtgaacataVRttttaDtcaaacataYDBtttaatBa -DtttYDaDaMccMttNBttatatgagaaMgaNtattHccNataattcaHagtgaaggDga -tgtatatatgYatgaStcataaBStWacgtcccataRMaaDattggttaaattcMKtctM -acaBSactcggaatDDgatDgcWctaacaccgggaVcacWKVacggtaNatatacctMta -tgatagtgcaKagggVaDtgtaacttggagtcKatatcgMcttRaMagcattaBRaStct -YSggaHYtacaactMBaagDcaBDRaaacMYacaHaattagcattaaaHgcgctaaggSc -cKtgaaKtNaBtatDDcKBSaVtgatVYaagVtctSgMctacgttaacWaaattctSgtD -actaaStaaattgcagBBRVctaatatacctNttMcRggctttMttagacRaHcaBaacV 
-KgaataHttttMgYgattcYaNRgttMgcVaaacaVVcDHaatttgKtMYgtatBtVVct -WgVtatHtacaaHttcacgatagcagtaaNattBatatatttcVgaDagcggttMaagtc -ScHagaaatgcYNggcgtttttMtStggtRatctacttaaatVVtBacttHNttttaRca -aatcacagHgagagtMgatcSWaNRacagDtatactaaDKaSRtgattctccatSaaRtt -aaYctacacNtaRtaactggatgaccYtacactttaattaattgattYgttcagDtNKtt -agDttaaaaaaaBtttaaNaYWKMBaaaacVcBMtatWtgBatatgaacVtattMtYatM -NYDKNcKgDttDaVtaaaatgggatttctgtaaatWtctcWgtVVagtcgRgacttcccc -taDcacagcRcagagtgtWSatgtacatgttaaSttgtaaHcgatgggMagtgaacttat -RtttaVcaccaWaMgtactaatSSaHtcMgaaYtatcgaaggYgggcgtgaNDtgttMNg -aNDMtaattcgVttttaacatgVatgtWVMatatcaKgaaattcaBcctccWcttgaaWH -tWgHtcgNWgaRgctcBgSgaattgcaaHtgattgtgNagtDttHHgBttaaWcaaWagc -aSaHHtaaaVctRaaMagtaDaatHtDMtcVaWMtagSagcttHSattaacaaagtRacM -tRtctgttagcMtcaBatVKtKtKacgagaSNatSactgtatatcBctgagVtYactgta -aattaaaggcYgDHgtaacatSRDatMMccHatKgttaacgactKtgKagtcttcaaHRV -tccttKgtSataatttacaactggatDNgaacttcaRtVaagDcaWatcBctctHYatHa -DaaatttagYatSatccaWtttagaaatVaacBatHcatcgtacaatatcgcNYRcaata -YaRaYtgattVttgaatgaVaactcRcaNStgtgtattMtgaggtNttBaDRcgaaaagc -tNgBcWaWgtSaDcVtgVaatMKBtttcgtttctaaHctaaagYactgMtatBDtcStga -ccgtSDattYaataHctgggaYYttcggttaWaatctggtRagWMaDagtaacBccacta -cgHWMKaatgatWatcctgHcaBaSctVtcMtgtDttacctaVgatYcWaDRaaaaRtag -atcgaMagtggaRaWctctgMgcWttaagKBRtaaDaaWtctgtaagYMttactaHtaat -cttcataacggcacBtSgcgttNHtgtHccatgttttaaagtatcgaKtMttVcataYBB -aKtaMVaVgtattNDSataHcagtWMtaggtaSaaKgttgBtVtttgttatcatKcgHac -acRtctHatNVagSBgatgHtgaRaSgttRcctaacaaattDNttgacctaaYtBgaaaa -tagttattactcttttgatgtNNtVtgtatMgtcttRttcatttgatgacacttcHSaaa -ccaWWDtWagtaRDDVNacVaRatgttBccttaatHtgtaaacStcVNtcacaSRttcYa -gacagaMMttttgMcNttBcgWBtactgVtaRttctccaaYHBtaaagaBattaYacgat -ttacatctgtaaMKaRYtttttactaaVatWgctBtttDVttctggcDaHaggDaagtcg -aWcaagtagtWttHtgKtVataStccaMcWcaagataagatcactctHatgtcYgaKcat -cagatactaagNSStHcctRRNtattgtccttagttagMVgtatagactaactctVcaat -MctgtttgtgttgccttatWgtaBVtttctggMcaaKgDWtcgtaaYStgSactatttHg -atctgKagtagBtVacRaagRtMctatgggcaaaKaaaatacttcHctaRtgtDcttDat 
-taggaaatttcYHaRaaBttaatggcacKtgctHVcaDcaaaVDaaaVcgMttgtNagcg -taDWgtcgttaatDgKgagcSatatcSHtagtagttggtgtHaWtaHKtatagctgtVga -ttaBVaatgaataagtaatVatSttaHctttKtttgtagttaccttaatcgtagtcctgB -cgactatttVcMacHaaaggaatgDatggKtaHtgStatattaaSagctWcctccRtata -BaDYcgttgcNaagaggatRaaaYtaWgNtSMcaatttactaacatttaaWttHtatBat -tgtcgacaatNgattgcNgtMaaaKaBDattHacttggtRtttaYaacgVactBtaBaKt -gBttatgVttgtVttcaatcWcNctDBaaBgaDHacBttattNtgtDtatttVSaaacag -gatgcRatSgtaSaNtgBatagttcHBgcBBaaattaHgtDattatDaKaatBaaYaaMa -ataaataKtttYtagtBgMatNcatgtttgaNagtgttgtgKaNaSagtttgaSMaYBca -aaacDStagttVacaaaaactaaWttBaagtctgtgcgtMgtaattctcctacctcaNtt -taaccaaaaVtBcacataacaccccBcWMtatVtggaatgaWtcaaWaaaaaaaaWtDta -atatRcctDWtcctaccMtVVatKttaWaaKaaatataaagScHBagaggBaSMtaWaVt -atattactSaaaKNaactatNatccttgaYctattcaaaVgatttYHcRagattttaSat -aggttattcVtaaagaKgtattattKtRttNcggcRgtgtgtWYtaacHgKatKgatYta -cYagDtWcHBDctctgRaYKaYagcactKcacSaRtBttttBHKcMtNtcBatttatttt -tgSatVgaaagaWtcDtagDatatgMacaacRgatatatgtttgtKtNRaatatNatgYc -aHtgHataacKtgagtagtaacYttaNccaaatHcacaacaVDtagtaYtccagcattNt -acKtBtactaaagaBatVtKaaHBctgStgtBgtatgaSNtgDataaccctgtagcaBgt -gatcttaDataStgaMaccaSBBgWagtacKcgattgaDgNNaaaacacagtSatBacKD -gcgtataBKcatacactaSaatYtYcDaactHttcatRtttaatcaattataRtttgtaa -gMcgNttcatcBtYBagtNWNMtSHcattcRctttttRWgaKacKttgggagBcgttcgc -MaWHtaatactgtctctatttataVgtttaBScttttaBMaNaatMacactYtBMggtHa -cMagtaRtctgcatttaHtcaaaatttgagKtgNtactBacaHtcgtatttctMaSRagc -agttaatgtNtaaattgagagWcKtaNttagVtacgatttgaatttcgRtgtWcVatcgt -taaDVctgtttBWgaccagaaagtcSgtVtatagaBccttttcctaaattgHtatcggRa -ttttcaaggcYSKaagWaWtRactaaaacccBatMtttBaatYtaagaactSttcgaaSc -aatagtattgaccaagtgttttctaacatgtttNVaatcaaagagaaaNattaaRtttta -VaaaccgcaggNMtatattVctcaagaggaacgBgtttaacaagttcKcYaatatactaa -ccBaaaSggttcNtattctagttRtBacgScVctcaatttaatYtaaaaaaatgSaatga -tagaMBRatgRcMcgttgaWHtcaVYgaatYtaatctttYttatRaWtctgBtDcgatNa -tcKaBaDgatgtaNatWKctccgatattaacattNaaacDatgBgttctgtDtaaaMggt -gaBaSHataacgccSctaBtttaRBtcNHcDatcDcctagagtcRtaBgWttDRVHagat 
-tYatgtatcWtaHtttYcattWtaaagtctNgtStggRNcgcggagSSaaagaaaatYcH -DtcgctttaatgYcKBVSgtattRaYBaDaaatBgtatgaHtaaRaRgcaSWNtagatHa -acttNctBtcaccatctMcatattccaSatttgcgaDagDgtatYtaaaVDtaagtttWV -aagtagYatRttaagDcNgacKBcScagHtattatcDaDactaaaaaYgHttBcgaDttg -gataaaKSRcBMaBcgaBSttcWtgNBatRaccgattcatttataacggHVtaattcaca -agagVttaaRaatVVRKcgWtVgacctgDgYaaHaWtctttcacMagggatVgactagMa -aataKaaNWagKatagNaaWtaaaatttgaattttatttgctaaVgaHatBatcaaBWcB -gttcMatcgBaaNgttcgSNaggSaRtttgHtRtattaNttcDcatSaVttttcgaaaaa -ttgHatctaRaggSaNatMDaaatDcacgattttagaHgHaWtYgattaatHNSttatMS -gggNtcKtYatRggtttgtMWVtttaYtagcagBagHaYagttatatggtBacYcattaR -SataBatMtttaaatctHcaaaSaaaagttNSaaWcWRccRtKaagtBWtcaaattSttM -tattggaaaccttaacgttBtWatttatatWcDaatagattcctScacctaagggRaaYt -aNaatgVtBcttaaBaacaMVaaattatStYgRcctgtactatcMcVKatttcgSgatRH -MaaaHtagtaaHtVgcaaataatatcgKKtgccaatBNgaaWcVttgagttaKatagttc -aggKDatDtattgaKaVcaKtaataDataataHSaHcattagttaatRVYcNaHtaRcaa -ggtNHcgtcaaccaBaaagYtHWaaaRcKgaYaaDttgcWYtataRgaatatgtYtgcKt -aNttWacatYHctRaDtYtattcBttttatcSataYaYgttWaRagcacHMgtttHtYtt -YaatcggtatStttcgtRSattaaDaKMaatatactaNBaWgctacacYtgaYVgtgHta -aaRaaRgHtagtWattataaaSDaaWtgMattatcgaaaagtaYRSaWtSgNtBgagcRY -aMDtactaacttaWgtatctagacaagNtattHggataatYttYatcataDcgHgttBtt -ctttVttgccgaaWtaaaacgKgtatctaaaaaNtccDtaDatBMaMggaatNKtatBaa -atVtccRaHtaSacataHattgtttKVYattcataVaattWtcgtgMttcttKtgtctaa -cVtatctatatBRataactcgKatStatattcatHHRttKtccaacgtgggtgRgtgaMt -attattggctatcgtgacMtRcBDtcttgtactaatRHttttaagatcgVMDStattatY -BtttDttgtBtNttgRcMtYtgBacHaWaBaatDKctaagtgaaactaatgRaaKgatcc -aagNaaaatattaggWNtaagtatacttttKcgtcggSYtcttgRctataYcttatataa -agtatattaatttataVaacacaDHatctatttttKYVatHRactttaBHccaWagtact -BtcacgaVgcgttRtttttttSVgtSagtBaaattctgaHgactcttgMcattttagVta -agaattHctHtcaDaaNtaacRggWatagttcgtSttgaDatcNgNagctagDgatcNtt -KgttgtaDtctttRaaYStRatDtgMggactSttaDtagSaVtBDttgtDgccatcacaM -attaaaMtNacaVcgSWcVaaDatcaHaatgaattaMtatccVtctBtaattgtWattat -BRcWcaatgNNtactWYtDaKttaaatcactcagtRaaRgatggtKgcgccaaHgaggat 
-StattYcaNMtcaBttacttatgagDaNtaMgaaWtgtttcttctaHtMNgttatctaWW -atMtBtaaatagDVatgtBYtatcggcttaagacMRtaHScgatatYgRDtcattatSDa -HggaaataNgaWSRRaaaBaatagBattaDctttgHWNttacaataaaaaaatacggttt -gHgVtaHtWMttNtBtctagtMcgKMgHgYtataHaNagWtcaacYattaataYRgtaWK -gaBctataaccgatttaHaNBRaRaMtccggtNgacMtctcatttgcaattcWgMactta -caaDaaNtactWatVtttagccttMaatcagVaagtctVaaDaBtattaattaYtNaYtg -gattaKtaKctYaMtattYgatattataatKtVgDcttatatNBtcgttgtStttttMag -aggttaHYSttcKgtcKtDNtataagttataagSgttatDtRttattgttttSNggRtca -aKMNatgaatattgtBWtaMacctgggYgaSgaagYataagattacgagaatBtggtRcV -HtgYggaDgaYaKagWagctatagacgaaHgtWaNgacttHRatVaWacKYtgRVNgVcS -gRWctacatcKSactctgWYtBggtataagcttNRttVtgRcaWaaatDMatYattaact -ttcgaagRatSctgccttgcRKaccHtttSNVagtagHagBagttagaccaRtataBcca -taatSHatRtcHagacBWatagcaMtacaRtgtgaaBatctKRtScttccaNaatcNgta -atatWtcaMgactctBtWtaaNactHaaaaRctcgcatggctMcaaNtcagaaaaacaca -gtggggWttRttagtaagaVctVMtcgaatcttcMaaaHcaHBttcgattatgtcaDagc -YRtBtYcgacMgtDcagcgaNgttaataatagcagKYYtcgtaBtYctMaRtaRtDagaa -aacacatgYaBttgattattcgaaNttBctSataaMataWRgaHtttccgtDgaYtatgg -tDgHKgMtatttVtMtVagttaRatMattRagataaccctKctMtSttgaHagtcStcta -tttccSagatgttccacgaggYNttHRacgattcDatatDcataaaatBBttatcgaHtN -HaaatatDNaggctgaNcaaggagttBttMgRagVatBcRtaWgatgBtSgaKtcgHttt -gaatcaaDaHttcSBgHcagtVaaSttDcagccgttNBtgttHagYtattctttRWaaVt -SttcatatKaaRaaaNacaVtVctMtSDtDtRHRcgtaatgctcttaaatSacacaatcg -HattcaWcttaaaatHaaatcNctWttaNMcMtaKctVtcctaagYgatgatcYaaaRac -tctaRDaYagtaacgtDgaggaaatctcaaacatcaScttcKttNtaccatNtaNataca -tttHaaDHgcaDatMWaaBttcRggctMaagctVYcacgatcaDttatYtaatcKatWat -caatVYtNagatttgattgaYttttYgacttVtcKaRagaaaHVgDtaMatKYagagttN -atWttaccNtYtcDWgSatgaRgtMatgKtcgacaagWtacttaagtcgKtgatccttNc -ttatagMatHVggtagcgHctatagccctYttggtaattKNaacgaaYatatVctaataM -aaaYtgVtcKaYtaataacagaatHcacVagatYWHttagaaSMaatWtYtgtaaagNaa -acaVgaWtcacNWgataNttcaSagctMDaRttgNactaccgataMaaatgtttattDtc -aagacgctDHYYatggttcaagccNctccttcMctttagacBtaaWtaWVHggaaaaNat -ttaDtDtgctaaHHtMtatNtMtagtcatttgcaaaRatacagRHtatDNtgtDgaatVg 
-tVNtcaaatYBMaaaagcaKgtgatgatMgWWMaHttttMgMagatDtataaattaacca -actMtacataaattgRataatacgBtKtaataattRgtatDagDtcRDacctatRcagag -cSHatNtcaScNtttggacNtaaggaccgtgKNttgttNcttgaaRgYgRtNtcagttBc -ttttcHtKtgcttYaaNgYagtaaatgaatggWaMattBHtatctatSgtcYtgcHtaat -tHgaaMtHcagaaSatggtatgccaHBtYtcNattWtgtNgctttaggtttgtWatNtgH -tgcDttactttttttgcNtactKtWRaVcttcatagtgSNKaNccgaataaBttataata -YtSagctttaaatSttggctaaKSaatRccgWHgagDttaaatcatgagMtcgagtVtaD -ggaBtatttgDacataaacgtagYRagBWtgDStKDgatgaagttcattatttaKWcata -aatWRgatataRgttRacaaNKttNtKagaaYaStaactScattattaacgatttaaatg -DtaattagatHgaYataaactatggggatVHtgccgtNgatNYcaStRtagaccacWcaM -tatRagHgVactYtWHtcttcatgatWgagaKggagtatgaWtDtVtNaNtcgYYgtaaa -ctttaDtBactagtaDctatagtaatatttatatataacgHaaaRagKattSagttYtSt ->THREE Homo sapiens frequency -agagagacgatgaaaattaatcgtcaatacgctggcgaacactgagggggacccaatgct -cttctcggtctaaaaaggaatgtgtcagaaattggtcagttcaaaagtagaccggatctt -tgcggagaacaattcacggaacgtagcgttgggaaatatcctttctaccacacatcggat -tttcgccctctcccattatttattgtgttctcacatagaattattgtttagacatccctc -gttgtatggagagttgcccgagcgtaaaggcataatccatataccgccgggtgagtgacc -tgaaattgtttttagttgggatttcgctatggattagcttacacgaagagattctaatgg -tactataggataattataatgctgcgtggcgcagtacaccgttacaaacgtcgttcgcat -atgtggctaacacggtgaaaatacctacatcgtatttgcaatttcggtcgtttcatagag -cgcattgaattactcaaaaattatatatgttgattatttgattagactgcgtggaaagaa -ggggtactcaagccatttgtaaaagctgcatctcgcttaagtttgagagcttacattagt -ctatttcagtcttctaggaaatgtctgtgtgagtggttgtcgtccataggtcactggcat -atgcgattcatgacatgctaaactaagaaagtagattactattaccggcatgcctaatgc -gattgcactgctatgaaggtgcggacgtcgcgcccatgtagccctgataataccaatact -tacatttggtcagcaattctgacattatacctagcacccataaatttactcagacttgag -gacaggctcttggagtcgatcttctgtttgtatgcatgtgatcatatagatgaataagcg -atgcgactagttagggcatagtatagatctgtgtatacagttcagctgaacgtccgcgag -tggaagtacagctgagatctatcctaaaatgcaaccatatcgttcacacatgatatgaac -ccagggggaaacattgagttcagttaaattggcagcgaatcccccaagaagaaggcggag -tgacgttgaacgggcttatggtttttcagtacttcctccgtataagttgagcgaaatgta 
-aacagaataatcgttgtgttaacaacattaaaatcgcggaatatgatgagaatacacagt -gtgagcatttcacttgtaaaatatctttggtagaacttactttgctttaaatatgttaaa -ccgatctaataatctacaaaacggtagattttgcctagcacattgcgtccttctctattc -agatagaggcaatactcagaaggttttatccaaagcactgtgttgactaacctaagtttt -agtctaataatcatgattgattataggtgccgtggactacatgactcgtccacaaataat -acttagcagatcagcaattggccaagcacccgacttttatttaatggttgtgcaatagtc -cagattcgtattcgggactctttcaaataatagtttcctggcatctaagtaagaaaagct -cataaggaagcgatattatgacacgctcttccgccgctgttttgaaacttgagtattgct -cgtccgaaattgagggtcacttcaaaatttactgagaagacgaagatcgactaaagttaa -aatgctagtccacagttggtcaagttgaattcatccacgagttatatagctattttaatt -tatagtcgagtgtacaaaaaacatccacaataagatttatcttagaataacaacccccgt -atcatcgaaatcctccgttatggcctgactcctcgagcttatagcatttgtgctggcgct -cttgccaggaacttgctcgcgaggtggtgacgagtgagatgatcagtttcattatgatga -tacgattttatcgcgactagttaatcatcatagcaagtaaaatttgaattatgtcattat -catgctccattaacaggttatttaattgatactgacgaaattttttcacaatgggttttc -tagaatttaatatcagtaattgaagccttcataggggtcctactagtatcctacacgacg -caggtccgcagtatcctggagggacgtgttactgattaaaagggtcaaaggaatgaaggc -tcacaatgttacctgcttcaccatagtgagccgatgagttttacattagtactaaatccc -aaatcatactttacgatgaggcttgctagcgctaaagagaatacatacaccaccacatag -aattgttagcgatgatatcaaatagactcctggaagtgtcagggggaaactgttcaatat -ttcgtccacaggactgaccaggcatggaaaagactgacgttggaaactataccatctcac -gcccgacgcttcactaattgatgatccaaaaaatatagcccggattcctgattagcaaag -ggttcacagagaaagatattatcgacgtatatcccaaaaaacagacgtaatgtgcatctt -cgaatcgggatgaatacttgtatcataaaaatgtgacctctagtatacaggttaatgtta -gtgatacacaatactcgtgggccatgggttctcaaataaaatgtaatattgcgtcgatca -ctcacccacgtatttggtctaattatgttttatttagtgacaatccaatagataaccggt -cctattaagggctatatttttagcgaccacgcgtttaaacaaaggattgtatgtagatgg -taccagtttaattgccagtgggcaatcctaagcaaaatgagattctatcctaaagtttgg -gcttgatataagatttcggatgtatgggttttataatcgttggagagctcaatcatgagc -taatacatggatttcgctacctcaccgagagaccttgcatgaagaattctaaccaaaagt -ttaataggccggattggattgagttaattaagaccttgttcagtcatagtaaaaaccctt -aaattttaccgattgacaaagtgagcagtcgcaataccctatgcgaaacgcctcgatagt 
-gactaggtatacaaggtttttgagttcctttgaaatagttaactaatttaaaattaatta -acgacatggaaatcacagaacctaatgctttgtaggagttatttatgctgtttactgcct -ctacaaccctaataaagcagtcctaagaatgaaacgcatcttttagttcagaaagtggta -tccagggtggtcaatttaataaattcaacatcgggtctcaggatattcggtcatataatt -tattaagggctcttcgagtcttactctgagtgaaattggaaacagtcatccttttcgttg -tgaggcatcttacaccgctatcgatatacaatgcattccaccgcggtgtcccgtacacaa -ggaaacttgttaccttggggatataagaaaactcacacgtctcattattaaactgagtac -aatttttgcacgagaaagtaatgcaatacaatatgatgaaagccagctaatgaaaaggga -tggaacgcacctcggatctgttgcactggattaaaatccgattatttttaaaaatattca -gtgctagagcatatcaggtctacttttttatctggtatgtaaagcccacggagcgatagt -gagatccttacgactcaacgaaaagttataacataactcccgttagccaaagcccaatcc -cgattactgccctaccctaacgtctgccatctaaatatcgaacttgttatgatcaatgtg -actacctcccaccctttccccttcatttgttccactggggataagctagcgttttcagaa -tcaatgcaataagaatagccaattgtctcacttcatcagagctcttggcaattccaggcg -ctacgtggttctggaatatattcatttttcaaatagtaatacgtttagtgttgctattgt -ctacacgtttggatattacgttatgtgagcggacatcaatagttgtctaactctttagta -agccagagatagcactcttagcgaatggataccatcttccataagtttagttaatagtcc -gaaacaactgcttcgagcatatttgaacctccttgtaggcaaatagcctcttcaaagcaa -tcttactaatagatagagtttgttttaagggactactagaaatgggacaatcttaatagt -atgacctaaactgacatttaaagatatatccaggtggcaagcataaagatcattgcgcca -cctccaccgtgggattacttatcagtcgatatcctatatgctaagtttgcgacggcagaa -tacaaactaagctgagttgatgctaaccttacctatgataccccattggaccggttaaca -gccctacttattccaaataaaagaacttttatgctgtagaagctattatagtgatgcctg -gtaacttcagtatattaaaatgacacacatacgccatatagagctcctggaactttgaat -aatgagcgaacttcgaagttgaagagcaagaaaccatatgtcacggttgcctaaagcccg -gtaaccagacatgtgctatcattgatcattatcgaggttttcataaccttgacccattat -cggctgtgcgcggacaagtacttaaatcactagtttcttcacctgcttatcggtaagaaa -taaggttggcaaagaatcgcataagacggacgtagagccgcagcgttgtgcgagtccagg -tgcatgcgcagcaataggattttaaattttgttccatttttaatttagccgtaaggatgt -ccgtaaatgattgaaaattggattcaatctttgggcctatgctactggaacctgatcgac -aaaatttcaaacatacgttaactccgaaagaccgtatttttgcggctagaatagtcagtc -gcttggagccatataccttaccacttaaacgacgtgctcctgtagttgaaatataaacag 
-aacacaaagactaccgatcatatcaactgaagatctttgtaactttgaggcgaagcaccc -tcttcgagacaactaagagtaaagtaccgggcgccgcaaggagtcgattgggaccctaaa -tcttgacgaattgctaagaggctcagagctaccactgtaatttctctagagcccataata -aatgaacgatacatccgtaggtagcacctaagggattataatggaagccaaatgcagtta -ataatattatatactggcgtacacgattcgacggatctctcacatagtgattcacgaccc -ccccctttgattgacacagcgtcagcattttgcaagaacgatcttctgcatagggtgcgc -caccgtaaggatgacgtcgaagctacaactgggtataatttaccatgcttccctgatgct -gagtgcaatacactaagaatgagtttttaccccatatcaccagtatttgttctgttattg -cgaagaaatggctatgctgagttggcgactaaagtcacccatcctttttattaggtaacc -ccctcccttaaactaactgatttgctggagctgccctgcatacatatactttatcattta -tggacgtccgtgacgcttattatccaccatagtcgatatgctacacggattcattaatgg -atcgtaggagtttaagttatatttactaagatcggtctcggctactatcccgccttaccc -ggcgctatttacggccatttttaatatattgacggtaattattcctatggtttcgaccgc -acgtccttggacaagaaagaatggcaaaaaaaatgtaaaagaaaaaaaatattgagtccc -taccatcatataaaaaatatgtgatgagtaacttgacgaaatgttagtggttattaaaga -ctatctattacaccttttgttttctgtcgtagtatattaaagtctagaagccttacagga -aaatcagggttatacagccgatactccgcagcatgaatcatcgaggaggtgtcctaccat -cgcgccttgtaatcttgtctgtgtatactgtatttagaccttttatacaaagtaaatatc -tcggctttatgtgattgggaggggcctactcaaacatgatgacttgacctaataatcact -gtgcgggcgtcttatgactagctattccttgaaatccaccaccaaatggttaatatgtaa -aaactttgacgatgaaacaaggtgaatgtgtagttactttgtgtaattagctgcgtcgag -cattgcttgtaaaaccgtcaatcgcacacgttacttccataaaatttctacgaatacacc -cttcttaaaaaaaacgtaggaattcacgagtttaacaaacgataactgtataaagtggaa -gtccgaagaaagcagatgcccgaactactcgaagatgtttcgttttcttaaccatagggg -cttcttaatggcccactacgcacattttgttcaagcccgagagggacatccccattacgg -gagtattactaaaactgttccgtaatacgttcagcaagggatgaaaaaggccactgctca -agttattgacgtgggagtattacatcggaagcctgaatcccacactatgatggtctgtac -aggcctagggactgcgtctagacggtattaccggcttctaatcatacgatcgtgagtctt -aacgggaagtaaggctcacacctaccccaaaccatttatctatgtaagtataaaattgtg -cgtaagtgttcaaagtggacaataaagacgtggcaaaaacccccgcacataagccgcttt -agatttcacaaataccaatgcggttaaaaacatccttgagtcgtacatacaccatactcg -cgttaaacggatataacagaagataataaatccggatgtggagtcggtgtaactatagaa 
-agccaagtgaaataatgcttaccagtcatttagctatacggctttcatttcatgtcaaga -gggtggagtttgacctgtacagttgatatatcaccgatacttagaactcacctaaagcta -aaattgctcgcagcgtgtaatccgcatattacaaacaatagatgggattcattatacata -agacacgatgatctgctttttcaggttgcgagatgttgcctatcgtcaatcgagtcctgc -cttacaccacttaaacaaaagtattgacagggaacctattttcgaggtattatatagtcc -agcttgaatatcaatttgacagttaacctagtgaaaatcagtaagaggaaatacgccaca -ttctccagtgaaattctacgggttatcgtctagtccaactatcaattataactcacgaga -tataagtaaattctcgtacttggcctgatttttattatactttggatccttagtaaacag -gaagggagaaaccttcaacgaaaaacactggattttgttttactctcaaagctcttatat -gacggaaataccctgtcaagtcttaactttattactagactaatgaaatgggcttggggt -ggccagaatcatagtacaatttagcggatacactattcggactttcctatcggctgtctg -gttggataagtatggggactaataggctagacatacctatacttaaactatacaggcgtc -atctatctctgcaactttggagttccctgatgttctcccgccctttgggttcacatcttc -tataccgacacccctaataacgattagtttgtgggttagagtaaattaatacggttaata -ttaatgtatcgttgaaaagctggtgtcgccaataaggtaaccggctaggcagagtatatg -tcacgaagtataactaccctaatgataagctgtaggaataaaattaatgctgtctctaag -cgaagagatatttccgactctgttttaatgacgaatctcattacttctgacttgcaaatg -ttcaatatggcacggtttcacggcacctttgtgacgcatataatgaacttagaagattat -aacgacggaactttatatgataatccgttacgattaaagaatctgttaaatatcataatg -gcattcagttctagaccgtgcatcatggtaaacttactttctctgcatggcgacatacat -ttcgctattcaaattcgcgtgtggttacacccactcgcacctttggaatattaagagaag -atgatcagaaaatccattcgctcaatttttctgacgtacgtctaatttatcctaggagac -aaatcgttttatgtctctcacatttttgaagaaaggttcgagagacaatactcaggtcct -gaactgctagaagatactcggtggagcgtggcaacaatgaaaaactcgtgacataaatga -atgatacttttccaagttcagttaagtgaatatgtttaacatacccggcttttcgatctt -aagctgacgctggacgtgcgagtaatgtcagtctcttacatacactagtgactccaagtt -tcgtcaaaaacgccccctcccttctcgagcccactcacgctatgtattgacgcgaacttg -ttcgggatcagacttttcaggagttcggtcgcgtgtccctatgtgctaatatataagtta -gatcgcattagatgctaatctgaatacttatagacgaccttcaacgagaacgggtaccac -cttgaggctagagttaggtgtgaaacgacaggtagggacatataaaatttgagtgcggct -ttagttaagggtttaattacctactcaaacatcacgctcgcgcccttcgtacgtaatcga -ccatctagaggctaaggggactgtactaggtagtgattaatgatatcctagacgcacgtg 
-ccttagatcttcagactctgatggtccgcgatcaccgtaattgtagtcctccaactcgat -cactttgttggcgtcaaagaaattacgatatctaaatacttataatacaataaccaagga -tgagaatgactcatcgcgttggagttatattgcttgaagttctatggaatgaaagcacgt -tatctgccgtcccaatatctccagtgagctaattcattggacggtccactttgatcaatc -cccgaggagatgttcggacactttagtctgtaacacttagcgttgagaccacgaacaatt -gattactcagtcttgaaggtgttttccaaagttcattttaaataagactacgataggcct -ttcctattgatataaactacccggctctgttgttcgtgtgagtcgtacttctctgtgttt -ttctgattatagcaagattcgattcttagtgtaaacagcgatttttatttgacccgtcaa -tgagaagcgcataggatctaagcaaaattatcaagttgtgccacaaggtaagatctttcc -agttattgcaggtaggatgtatcccacgttgatagtatgaggtctgacgtcaactgtcta -ggagagttgaccgcgtgcgggtacaccggatttgcatcgatgttgagaacgcagaactcc -cactgtcgtggcggcgttcctgatatttagcaagaggcgttgataaagccctcatcatct -agatctcgacctcatctgccctcttgctccatcattttctacacagactactttcctatc -tacgttagtataattgctttctatcttagtatcatttagagcttctccgtcaacaggttc -gtgctattaaagttagtacgaaagggacaacttgtagcaacgcatttaatcggttttcga -ctacttcgcacaaaatcagataaagaagtttgtcattctattagacattgaattgcgcaa -ttgacttgtaccacttatgatcgaacactgaatcaagactgtgattaactaaaatagaca -agccactatatcaactaataaaaacgcccctggtggtcgaacatagttgactacaggata -attaattggactggagccattacattctctacaatcgtatcacttcccaagtagacaact -ttgaccttgtagtttcatgtacaaaaaaatgctttcgcaggagcacattggtagttcaat -agtttcatgggaacctcttgagccgtcttctgtgggtgtgttcggatagtaggtactgat -aaagtcgtgtcgctttcgatgagagggaattcaccggaaaacaccttggttaacaggata -gtctatgtaaacttcgagacatgtttaagagttaccagcttaatccacggtgctctacta -gtatcatcagctgtcttgcctcgcctagaaatatgcattctatcgttatcctatcaacgg -ttgccgtactgagcagccttattgtggaagagtaatatataaatgtagtcttgtctttac -gaagcagacgtaagtaataatgacttggaataccaaaactaaacatagtggattatcata -ctcaagaactctccagataaataacagtttttacgatacgtcaccaatgagcttaaagat -taggatcctcaaaactgatacaaacgctaattcatttgttattggatccagtatcagtta -aactgaatggagtgaagattgtagaatgttgttctggcctcgcatggggtctaggtgata -tacaatttctcatacttacacggtagtggaaatctgattctagcttcgtagctgactata -ctcaaggaaccactgctcaaggtaggagactagttccgaccctacagtcaaagtggccga -agcttaaactatagactagttgttaaatgctgatttcaagatatcatctatatacagttt 
-ggacaattatgtgtgcgaaactaaaattcatgctattcagatggatttcacttatgcctt -agaaacagatattgcccgagctcaatcaacagttttagccggaaacaatcgaagcatagg -gacaatgtatcttttcctaaattgccatgtgcagatttctgagtgtcacgaagcgcataa -tagaatcttgtgttgcctcaactcgttgaaaagtttaaaacaatcgcagcagtctttttg -gggtctactgtgtgtttgcaaaataactgaaagaaacgcttgaacaactctgaagtagct -cgagtactcattaaagtgtaacacattagtgaatatcggccaatgaaccaaacgcttccc -ggtacgctatctctctcatcgggaggcgatgtgcaggttatctacgaaagcatcccttta -cgttgagagtgtcgatgcatgaacctcattgtaacaatagcccagcaaattctcatacgt -gcctcagggtccgggcgtactcctccatggaagggcgcgcatctagtgttataccaactc -gctttttaactactatgctgtagttctacaggcatagtggccagtattttctaacttctc -tggatagatgctctcactcctcatccatcacggcttcagtttacgtcttacttgcttgtt -cagcaacggatggaggcattaagtatcttcactgttccctaaaattgctgttcaatatca -aagtaaggacgatacagggaaagctcaagcacactcattgaatactgccccagttgcaac -ctcacttaatctgacaaaaataatgactactctaagtgttgcggaagcagtctcttccac -gagcttgtctgtatcacttcgtataggcatgtaactcgatagacacgaacaccgagtgag -aaactatattcttgcttccgtgtgtgtgacaccaggtaattgatgcggatataagctgga -gatcactcacgcccacacaaggcgctgctacctctttattccaatgtgtaagaatttgct -aacttcatttctagaccgcagctttgcggtcataatttcacggtacggacccttgggtta -gagacttgataacacacttcgcagtttccaccgcgcacatgttttagtggcttctaacat -agaatttttgttgtgacataaagagtgcgtgggagacttgcccgaccgttaagccataat -caattgaaagccccgtgagtcacatctaattggttgtactgcgcatttagctatccttta -gctgactcgaagagattcgattcctaatataggttaattagatggctgccgcgcgaagta -aaacgtgaaaaacgtagtgcgcagatctgcataactcgcgcttaattacttatgagtagt -tccaagttcgctacgttatgagagagattggaattaagcaaatatgttttatggtgattt -tgggatgagaaggactgctaagtacggctactaaacaaatttctaaaaccgccatctacc -ttatcttggagacatttaagttgtatatgtcactagtctagcttttgtctgtgggacgcg -ttctcggaatgagggaaatgcaagagccgattcatcaaatgcttatctaagaaagtagtg -gactattacaccaagcacgaatgccagggaactgctttcttgctcaggacctcgcgacaa -ggtaccccgcataagtcctagaattacatttggtcagcaatgctgacatttgaccgtgaa -aacataattttaatcagaaggcagctcacccgcttgctctagatcttatctttgtatgaa -tgtcagaatttactgcaatatccgttccgaatagtgagggcttagtatagttctctgtat -acaggtcacatcaaactccccctgtcctagtacagctctgagctttaattaattgcatac 
-atttccttcaatcatcagatgaaaacaccgcgaatcatgctcttctcgtatagggcaaga -gaagcaacaaacaactagcccgactcacgttcatccgccgtatccttgttcagttcttac -tccgtattaggtcagcgaaatctaatcagaataatcggtcgcgtatcaaaattaaaatcc -cgcttgaggttgacaattaaaacgctgagcagttatcggctattagatagtggggtgaaa -gtaattggctggaattatgttaaaacgtgatattaagctaaaatacgctacttgttgccg -acctaattcagtcattcgatattcagttagagccaagaataacaagcttgtataaattga -acggggtgcactaaacgatgtgttactctaatattcagcttggagtatacctgaaggcga -attcatgtatcggccaataataagacgttgaagatcacaatttggactagcaaaagaagg -tgatttatgcgtggggattgagtccactgtacgagtacggtctctggaaaattataggtt -cagggaatataaggaagtaaagataattaccaagagatttttggtatcgctatgacccag -aggtgttctaacgtctgttttgatccgcagaatttctgcctcaatgcatatttgacggac -ttgaactagagcctctaaagttaaatggcgacgcaactgttcctaaacttcaattattac -tactctttttttcctagggtattgtagaggccagtggacaaaataaatcaaatttaagat -gtttcggacattaacatcccccgtagcatagaaatcatcagttatccaatctctcatcga -gcttttacaatttctgctggcgctatggacagcatatgccgcgagacctccgcaagactc -acttgatcactgtaagtatcttcattagaggttagagcctatagttaagctgctgaccta -gtaaaattggtattttctaattttattgctcaagttaaaggttagtgaagggataatgac -gttatttttgaacaatgggttgtattcaattttatatcacgaatggaacccttcattccc -ggcataatactagacgacacgaacaagctccgatctatcagccaggcacgtgttaaggtt -taattccggcaaaccaatgaagcatcaaaaggtgacctgatgcaacttagggtcacgatg -agtttttcaggactacttattacctattaataagttaacatgagccttcataccccgtaa -gacaatacatactccaccaattagaattctgagccatcttatctttttgtatcatcgaag -ggtatggccgaataggttaattagttactcctaacgtctctacaggcatgcatttgacgc -accttcgaaaatagtcaatctctcgccacacgcgtctagtatgcagcatcaaaaatatag -tccacggtttccggattaccaaacgcggcaaagagaaacattgtatcgacggagataact -taatacagaaggaaggggcatcttcgaatacggatgaataattctatctgtttattctga -catcttgttttcaggttaatcttacgcattcaaatgacgcctgccccatgcgtgcgcaat -tattttctaatattgacgagagcaatctcactccttttgggtctatttatgttttattga -ggcacaagcctatacagaacaggtactattaaggccgtgagtgtgagactcaaaccgtgg -aaacaaaggatgggttgttcttggtacaagttttagtgcatgtgggcaatccttaccaaa -atcagatgctatccttaactttgggctgcatttaagatggcggttggaggcctgtgagaa -tcctgcgtgtcatctttaatgaccgaattcatccatgtagattcagatcacacactcatt 
-ccttgatgttgtctaaacaaaagttgttgtggacgcattggagggagttaagtaacaact -tgggatcgcatacttataaaaattatatgttaaactttcacaaacgctgaagtccaaagt -aactagcccaaacgcctcgagagtcactaggtattaatggtgtttgagttcctgtgaaat -agtgttcgaaggtaaaatttatgtaccaaatcgaaagaacacttaataaggcttgcttgc -acggaggtatgatgtttactgactctacaaccctaattttccagtacgtacattcattcc -aataggttagttctcaaagtgctatacaggctcctcaattgatgatatgcttcagccgct -ctatggatattagctcattttatttaggaagcccgcttagaggcttactatgagggaaat -gccaaaatgtcatacttttcggtgtgtcccatatgacaccgctttacatagaatttgaat -taaaacgcgctctcccgttcactaccatacttggtaccgtgcgcatattacatatagata -taggatcattttttaaagctgtactaggtttgatcgacaatcttatgctatactatatga -tgtaaccctcataatcaataccgatcgtacgatcctagcataggtggcaagcgattttat -gccgattattgtgttaaatagtctgtgagtgtgattatcagggctacgttggtagagggg -ttgtatagacctcgcacacattgtgacatacttaacaatatacgaaaactgatataataa -atccccttacccaaacaccaatcccgttgaatcaactaccataacgtctcccatataaat -tgcctacttgtttgcataaatctgaatacataacaccattgcaccttcttgtgttccaat -cccgttaagattgccttgtcagatgatatgcaagaacaatagcatttgctagcaattatt -aacagctcttcgaattgcctccacataacgcgggagggtatattttaatttggcaaatac -taagtactgttggcgtcatatgctattaacggttggatattaagttatgtcagccgtaag -caagagtgggcgaaatattttgttacccagtgagagcactcttagagtttggatacaata -ggccatatgttgacttaagaggacgtaactacgccgtacaccattgttcaaccgacttct -tggcaaatagaatcgtattagcaatcttaagaatagagacacgttcgtgttagggtatac -tacaaatccgaaaatcttaagaggatcacctaaactgaaatttatacatatttcaacgtg -gatagatttaacataattcagccacctccaacctgggagtaattttcagtagatttacta -gatgattagtggcccaacgcacttgactatataagatctggggatcctaacctgacctat -gagacaaaattggaaacgttaacagcccttatgtgtacaaagaaaagtaagttgttgctg -ttcaacagatgatagtcatgacgcgtaacttcactatagtaaattgaaacaaatacgcaa -tttagacagaatggtacggtcatgaatgacagtaattcgaagtgctagaccaacttaaaa -taggtaaacgtgcccgaaaccccccttaacagaaagctgctatcatggtgcagtatcgac -gtgttcagaaacttgtaacttttgagcaggtccgagcacatggaagtatatcacgtgttt -ctgaaccggcttatccctaagatatatccgtcgcaaactttcgatttagtcccacgtaga -gcccaagcgttgtgcgactccacgtgcatgcccagaaatacgagtttaaatttggttaca -tggttaattttgaccgaagcatcgcactttatgattgataattggattcaatatgtcgcc 
-ctatgcgaatgcaacatgatccacaatttggctataagacgtttaatccgtatcacactt -tgtttgcggctagtatagtaacgcccgtgcaccaagagtcagtaacaattataagtactc -cgcaggtacttcaaatataaaaactaatcaaacacgacccatatgatcatctgaagatat -ttggaactttctcgacaaccaccctcgtactcaatacttacactaatcgacaggcacacg -caacgtgtacagtcgcaccatattgagtcaagatttgcttagtggcgatgagcgtacacg -cttatttctctagtcacaattagttatctacgagacatcacgagggagcaaataagcgat -gttatggctacacataggcacgtatgaatatgatataagccagttaaacagtcgaaccat -cgagcaaattctcatgcaccaacccacacgttgaggcacaaagagtaagctgtttgaatg -taacttcttctgctgagcgggccccaacgtaaggatcaactagaagagaaaactcggtat -tagtttaaatgcgtcacggagcatgagtgcatttcactaagaatgtctgtgtaaccaata -taacatctatttgttatctgattgcctacttatggctttgcggtcgtggcgactaatgtc -tccaatccttttgaggtcggtaccaactccctttaaattacgctgtgcaggctcatgcac -tgcatacatatacggtagcaggtagggacctcacgcacccttattataatcaatagtagt -tatcagtcaacgaggcaggaatgctgaggtcgaggtgttggtatattttctatgtgccgt -ctaggcgactatcacgcattaccaggcgagatttaagccaattttgaatatagtcaacgt -aatttttactatgggttccaccgaaacgccttgcacaactaagaatcccataaaatatcg -atatcaaataaaagattgtgtcaataccttcatatatattttttcggttgactaacgtga -actaaggttaggggttttgtatgtctatataggaaacagtttcttttctgtcctacttta -gtaaagtcttcaagccttactccaaaatcacggtgattaagccgttactcagcagcatga -ttctgcctgctcgggtcctaaaatccagccttgtaagagtcgctgtgtattagctaggga -gacctttgttaaaaaggatatatcgcggcgggatgtgagtgcgtggcgcatactcaatct -tcagctcgtgtcattataatatctctcccccacgcttttcactagatatgccgtgtaagc -aaacaccttatgcttaatttcgaaaatattggtacttgaaaaaagctgtaggggtactta -atgtctggtaggagatcaggagagaattgagtgtaaaaccgtaaagccctcacctgactt -catgtaaatggcttagaagactccatgatttaataaatactacgaaggaaagactggatc -taaagataactctagtaaggccaactcccttcaatgctgttgccagttataatccaagag -ctgtccttttctgaaccatagcggcttctgaagcgaactagaagcaaagttggttctagc -cagacagccacataccctgtacgggtgtattactaaaactggtccggtattagttcacca -agggaggaattaggcaaaggatctaggtatgcaagtcggagtattacatccctaccctga -atccatcaataggttcctctgtactggccttcgcaatgagtattcaaggttgtacagccg -tataataataagatagtgactatgaacgggaagtaacccgctcaccttccccaaaacatt -gttatatctaagtattaaagtctgccgtagtgttaatactcgaaaataaacaactggcaa 
-attacaccgcacttaagccgcttttgatttatatttttccaatgcgcttttaaaaataat -tcagtcctacatactaattaagacccttaaacggagatatcacaagttaagttttaacca -tctcgactaggtggaactatagatacccaactcaatttatcattacctgtaatgttccta -gaaggattgcatttcatgtcaagacggtggagtttcacagcgaaacttcagtgtgaacag -attctgagaaatcacctaaacctattagtcagagcacccggttagaaccagttgtcaaaa -aatagagcggttgcatgagacagaagtaacgatgagatccgttgtaacgttgagacatct -ggcctatcgtcaatacagtcctcccttaaaaatatttttaaatactaggcaaacccaaca -taggttagtcctatgtgatacgccacatggtatatcattttgtaacgttacctagggata -atcaggaagtggaattacgcaaaagtagacagtgaaatgcttagggttatagtctagtcc -aaagataaaggataaagcacgtcagagaactatattagccgaatgggaatcattgttagg -agactgtggatcatgtctaaaaagcaacgcagaaacagtcatcgaaaaaatctcgttttt -gtttgaatctaaaagagctttgatgaccgatagtacctgtatactagttactgtattacg -tgtctaatgatttcggattggggtccccagaatcagacgtcattgtagacgattcaagtt -taccaatttaatttcccagctctccttggagaactatcgccaataattgcagtcactttc -cttttctgaaacgataaagccgtcagagttctctgcaacgttggacttacctgaggttct -aacccactttcggttctaatagtagttaacgacacaacgaataacctttactgtggggct -ttcacgatattttttcgcttattattaatggttacgtcataagctggtgtccaaattaag -gttaccggcttcgcagagtagttgtatccaagtataacttccctaatcataagatcgagg -tagaaaattaatgctgtctctaaccgaacagatatgtcccactatgtggtatggacgttg -ctaattacttctgaagggaaattggtcattatggatacgtgtctaccatcaggtcggacg -cagatatggttctgtcttcagttgatccaccgttctttataggataataactgacgatta -aagattatggtaaatagattaagccaattctcttcttgtcagtgaagcatccttaactga -cttgctctgcagcccctcatacatttagctattcaaagtaccggctcgtttcaaactctc -ccacctttggaagaggttgtcaacttgataagtatatcatttacagcattttttcggacg -tacctctaatgtttcattgcagaaaattagttttttctatcgcacattttgcaagtaacg -ttagagacacaattatctgcgaatgaactgctagatctgacgaccgggagcctcgcaaat -atcaaaaaagactgacatatatcaaggagtcgttgacaagtgctggtaagtcaattggtt -tatctgtcccggcgtttcgatcttaagctgaccatgcacggcagagtaatgtcactctcg -ttcttacaagtctgtctccaagggtcggcaaaaaagacccctccattctcgagcccactc -acgatatgtagggacgacaacttgtgcggcttatgaattgtctggactgcgggcgagggt -ccatatctccgaagttagaagggacatacctttagatgataagatcaattcttattgacg -aaattcatccacaacggggaacaacttcaccctagacttacgtctgaaaagacacctagc 
-gtcttataaaaggtcagtgccccgtttcgtaaggctggaattacctacgcaaacttaaac -ctcgcgcccttccttacgtatcgacaagatagaggctatcgcgaatgtactacggaggca -tgaatcatatactagaaccaagtgcctgtgatattaacaagatgatccgacgcgagcacc -gtaattctaggcataaaactccagcaatttgggggccgaaaacaaatgacgttagctaat -taattatatgacatgatcaaaggaggtcaatcacgcatcgagttcgacgtatattcattg -aacttcgtgcgtttgaaagaaacttttatgaaggcaaaattgatcctgtctcctatttca -tgcgtacctcctagttgataattccccgagcagtggttaggacacttttgtcggtatcaa -gttccggtctcaaaacgtaaaattctgtaatctgtatggatggtctgtgaattagttaat -ttttatgaagtcgtcgagacgcagttcctattgatttattctaaacggagatgtgcttcg -tgggactcggaagtagatctgtgtttatgattattgctactttagatgctgactgttaac -tccgtgttgtttttcaaccgtatatcacaaccgaattggatagaacctatagtttcaagt -tctgccacaaggtatcatatttacagttagtgctggttgcttctttcaaacgtggtgagt -ttgtgctatcacgtcaacggtagagctcagtggaccgagtgcgcgttcaaccctgttcca -gagagggtgtgatagcacatataccacgctcgtcgaggcgttcatgatagtttgcaagag -ccggtgttaaacacatattattattgttatccaactaatcggacctatgcataaagcatt -gtctaaacagaataattgcctatatacggtagttttagtgatttatatcttagtatcagt -tagagcttcgaactcttcaggttcctcatatttaacgttcttcgaaagcgaaaacttcta -caaacgaatgtaagcggttttccaagtagtacctataaatcacagaaagatctgtctcag -tatagttgaaatggtattcagctagtgacgtgtaccaattatcatagttcactcaagcaa -gacgctcattaacgaatatagacaagacactatatcatataataaaaaagaacatggtgc -tcgaacatagttgaattcaccatattgaaggggaatgctgacatgtaattcgctactaga -cgatcaattccctacttgtcaaagttgaactggtacgttcttggaattaaatatgattgc -gctggaccaaattgcgacttcttgagtttcagggcaaacgattgagccggaggatgtccg -tctcttacctttcttgcttatgataaacgacggtccctgtacatcactgggaattctcag -caaaaataattgggtaaatcgagactcgatgtattcggccacaaaggtgttagacgttaa -agattattcaacggggcgataataggatcataaccggtatgcaagcgcattgaaagagcc -atgagatccttatccgataaacgctgcacggtatgtgcagccttattgtcgatcacgaat -ttataaatgtagtctgggctgtaagttgaagacctaagttataatgaagtgcaataccaa -atcgattcatagtggattatcagactcaagatatctcctgataaattacagttgttaaga -tacggataaaatgagatttaagattagcagcctctaatctgtttcaatcccgttggaatg -tggtatgcgatcaaggttaagttaaaatcaagcctgtcttcagtcttgattcttgttctg -ccatcgcatgcggtctacgtgagttaatatgtagcttacgttctagcttgtgctaatctg 
-agtatagattcgtagaggaatattatcaagcttccacgcctcaacgtacgtgtattggtc -acacaagacactaaaagtggaagtagcgtaaactatagtctagttgttaaatgctcagtt -cttgttatattcgatatactcttggctaatttatgtctgagtatataaaattaatgatat -taacttgcatttcacggatcccttagaaaaagattttgaccgagcgcattataaacggtt -acaccgaatcaatagaagcatacccaatagctttctttgaatttattgcctgcgcaactt -ggctgactctctagatccgaataattctatatggtcgtgacgaaactagttcattactgt -ttaaaatgccaacatgtcttttgggccgataatggctctttgcaaaattactcaatgata -cgattgatcaaagcggtagttgctagtggtagcatgtaagtctatcaaatgtctgattat -ccgaaaatcttccaaaagagtccacgtaccatatctatctcatagcgacgcgaggggaac -cttatctaactatcattccatttaccgggtgactctcgatgcaggatccgattgggataa -attgcccagaaatggctcattcctgactaagggtaaggccgttctcagcaagggaacccc -gcgaatctaggcttataccatctagattgttaactacttgcctgtagttctacagccata -ctggacagttgtttctaaatgatcgggattcatgctagcactcctctgaatgcaccgcgt -aagtttaactattacgtccgtgggcagataaggatggaggctgtatgtatcttaactgtt -acctaatatggctggtaattatcaaagtaaggaccttaatgccatagcgctagcaatcgc -tttgtatactgaccatgtgccaacctctcttaatctgtaaaatataatgtcttagctaac -tgtggacgatcatgtctctgcctagagcttcgctgtatcaattcctatagccagcgtact -agtgacacaacaacaccgtgtgagaaaagatattagtccttacgtctgtctctctacagc -ttattgatgaggattgaacatggacatatagctccccctcaaaagcagatgctacctctt -tattccattctcgaacatttgccgaacttaatttcgacaaacctgaggtcacgtcttaat -ttatcggtaacgtcacgtccctttgagactggataaatatattaccaggggccaacgagc -aattgttggaggcgcttctataatacaaggtgtcttgtcaaagaaagacggcgtgcgtct -cgtgcaactcacttaaccaatattaatgtgaaacccccctctctcacatcttatgcggtg -tactgccctggtacatttcctgtacaggactccaacagtgtagattcctaagatagctgt -tggagttgcctcacgccagatcgaaaaactgaataaactagtgagctgagctgcagaaat -accgcttaattacttatgactagttcaaagggacctacgtgatgtcagacattgcaagga -agaaattaggtttgtgcgtcattttggctggactagcactccttacttcccctactattc -aaatgtcgtaaacagcatgagacaggatcgtgctgacatttaaggtctattgggaacgag -gctacctttggtcgcgcgctcgcgttctccgaatgaccgaaatgcatgagcacagtatgc -aattgcttatagatctaaggtctggtcgttgaaaccaagcacgtaggcctgggaaatcag -ttcttcctcagcaactacacaaaagcgtccaagcattagtacttgtagtaaatgtccgaa -cctatgcgctcatttgaaagtcaaaaaatatttttaagcagtaggcacctaacccgattc 
-ctctacttagtagctttctttgattctcagaattgactgcaatatcactgcacaattctg -tgccattactagacttctctgtattaacgtctcatcttactaacactcgcctaggacaca -tctgagagtgaagtatttcaatacatttactgaaatcttcagttctaaaatccccgaata -aggctcttatcggtttggccaacacaagaaaaaaacttcttgcaccactcaccttcatac -gcaggagcctggggaacttagtaataactatttcggcagacaaagcttataacaagttgc -cggcgcgtataatatttaaaagaccccttgagctgctcaattaaaacgctcacctggtat -aggctattagatagtgccgtcttagtaaggggcgggaattatcggataaactgatatttt -gataaaataaccgacttgttcacgacataagtcactaaggagattttatctttctccaaa -gtatatcttccttggataatttcaaagcgctgcaatttaagttctgttactagtttatgc -tgctgggaggtgaccggaaggcgtagtaatctagaggcaaattataagaagttcatcata -tcattttcgactacaaaaacaaggtgttgtatgccggcgcattgtgtaaactggacgagt -accctagatggaaaattatacgttaagccaagatttcgatgtaatgataattacctacac -atttttgctatccataggaacaagagctgttctataggctcgtggcatacgaacatttgc -tgccgctatgaatattggaagctcttcaactacagactctattcttaattgccgtcgaaa -atgggccgaatcggctattattaatactcggtttttccgaggggattgttgtcgacagtc -gtaattattattaatattgatgttggtgaggtcatttaaatacaaccttgcagacaatga -ataagggatccaatctctcatactccttttacaattgctcatgcccctatgcaaacctta -tgccgccacacctccgcaactctctcttctgaactgtaagtagcttcattactggtttga -gactatactgaagctgatgacattctaaaatggctattttcgaatgtgattcataatgtt -tatcgtttgggatggcagaatcacgttatttttgatatagcccgggtattctattgtata -gaacgtatgctacaagtcattccccgaagaagactagaagtaaacaacatgcgaccatcg -ttaagccacgcaaggctgtagctttatttcccgataacctatcttccataaatagcggac -agcaggatactgacgctcaacatcagtggttatggtctaatttttaacttttaataaggt -aacttcagcaggcatacacagtaactctttaatttataatcaaattagaagtctgacact -tcttatatttttctatcatccaacgcgatcgcccattagcttattgtgttactaataacg -tatctaaaccaatccttttcaagctactgcctatattgtcaatatatacaaacaacagga -tagtaggctgcttaaaaaatattgtcaaccgtgtacgctttacaatacccggaaatcaca -aactttgtagacaacgagtgaaatttatacactacgaagggccagcgtacaagacccatg -aattaggcgatatgtttattctgacatattggtttatccttaatctgtcgctgtaaaatg -aagccgcccccatccctgcgaattttttttcgaagattcacgactgaaatataaatacgt -ttggctatatttatgttggagggaggcaatagcctttactgttaaccgaagatttagcca -gtgagtgtgacactaaaacactggaataaatgcaggcgttcttctgggtaaaaggtttag 
-tcaatctcgcctataagttcatatagctctggatataattatctggcccatgcatttatc -atggcgcttggtgccctgtgtgaagccggcctctcatattgaaggtccgaagtattccat -gtacattaagatcactctctcattcatgcatcttggcttaacaaatctggttgtccaagc -tttccaggcacgtatggtacaaattcggatcgaatacttataaaaatgatatgttaaact -gtctaaaacgctcatctacaaagtaaagtgcactaaccaatagagtctcaagaccgtgta -atgctggtgcactgaatgtgtaatacggttagaagggattagttatgttacaaatccatt -gaaaacttaagaagcattgcgtgctcggagggtgcatcttttatcaagagactaacatta -ttttcaacgacgtacatgctttacaatagggtacttatcaaacgccgagaaacgcgccta -tagtgatgttatgattatgacccgatatccattggaccgaattttatgtaggttcccagc -gtactcgcgtaatatctcggtattgccataatgtaatacttgtcggtctctcccagatga -aaaagcgttacagagtatttcaatgaaaaacagcgcgcaacgtcaatacctttaggggta -acggccgctgatttcatatagatatacgataagttggtatagctctactaggtggcatcc -acaatcgttgcatttactatagctggttacaatcataatctataccgttccttacatact -accatagcgggatagcgtttttttgccgttgattgggtttaagaggatgtcagtctcatt -atatccgattcggtgggagagccgttgttttcaaatcgcacactttgtgacataatgtac -aagataacaaaactgatataagatataaactgtcaatatcaccttgacacttgaatcaaa -gtaaattaactcgcaaatataatttgactaattgggtgcagatttctcaattaataaaaa -aatggcaccggatgggcttacaagccccttatcattcacttgtatcatgatttccaagaa -caatagaatttgctagcaagtatgaacagagattcgaattgcatccacagtacgccggag -cgtttattttaatgtggatatgacgatgtactgttggcggcatttgctagtaaccggtcc -ttatttacgtagcgcacacgtaagcatgtctgggagaaatatggtggtacaatctcagag -aaagattacagtttggtttaaataggacttatcgggtcggaagtggaacttaataagcag -tacacaattgggcaacagacgtcttgcctattacaataggattacaatgcgttagatttc -agacacgttcgtgtttggctattcgtcaattccctaaatagttagacgatcaactattat -caaagtgattctttgttcatcctccattcatgtaacagatggcacactacgcataacgcc -gaggaattttaacgagatttaagagagcagttcgggcacaacccacttgactttataaca -gctcggcagcataaacggtaatatgtgacaaatttccaaacgttataagaacgtatgtgt -acttagaaaactaagtggttcatgttcaacagatgtgacgcagcaagcctaacttatcta -ttggttttgctataaaagaacaaagttacacagaatcctaagggcttgtttcacacttat -gcctagtgcttcaccatcttaaaatagcgaaaccggcacgaatcaaaccttaaaacaatg -cgcagatattggtgatggtgactccgggtatgataatggtaactgttgaccagcgcccac -ctcatcgaagtatagaaagtggttaggataaggatgagaccgaacttatttccggccata 
-actttagattttctacctagtacacaacatcagggcggacacgaaaccgccatcacatca -tataccaggtttaatttgcttaatgggggaagtgtcaacgaaccttcgaactttagcagg -catatggccattatatatggccccagagcagaatgctacagcagacaaaatttggattta -tgtagtttaatacctatcaaacttggtgtgaccatacttgtctaacgacagtgcacaaag -tgtaagttacaattattactactcagcagcttctgcaatgataaaatcttatcatacacg -tcacatatgataatatctacttagggggaacgggctccacaacctacatagtactcaata -cttacactattcgacaggcacaccaaacctgtacagtcccaaaagattgagtcaactttg -cagtactgcagatcacagtaatagcttagttagcgagtcaaaattagttttctacgagac -tgcacgaccgtgcaaatttccgatgtgttggctacaaatagcaacgtatgaatttgtttg -aagccacgtaaactgtacaaccttagagataagtctcaggctactaaaaacacgttgtgg -cactaacaggatcatggttgattcttacttattcggctgaccggcccaataagtaacctt -caactagaacagaataatcgggagtagtttaattcagtcaaggtgcaggtctcattgtaa -ctaacaagctctgtgtaaccaagttaaaatcgttttcttagcggattccctacttatgga -tttgagctcgtccacaatattcgatacaagaagtttgtggtccgtaacaacgaaatttta -attacgctgtgcagcctcatccaaggaattaatagaaggttgatggtaggctccgaacgc -tccatgattataatcaagtggactgtgcagtaaacgaggaaggtatcctgacgtcgtggt -gttcgtttttgttatttgtgccctatacgagtagataaaccatgaacagcacagtgtgaa -cccatggttgattttaggctaccttatttttaatttccgttacacagaaacgaattccac -aactaacatgccattaatttttcgatatcttataaaagatggtcgaaattcattcattta -ttttttttcggttctcgaaagtcaactaagctgtcgcgttttgtttctctttagaggtaa -aagtggctttgatctcctacgtttggatactagtcaaccattactccatttgatccgtga -gtatcacctgtctaacatccagcattatgactcctcggcgaagaaaagacacacttctta -gagtcgatgtgtattagctagggacacagttgtttaatacgatagtgagcccagggaggg -cagtgcgtcccccagtagatttattcagctagtgtaagtataagatatctcacccacgag -gttcaagtgatatgcagtcttagaataatacttatcctgaatttcgatattatgggtact -tcaataatccgctagcgctactttatgtctcgttggacagcaggacacatggcagtctta -aacactaaagacatcacctgaatgaatgtaatgggattacaagaatcaatgaggtattat -atacgacgtaggaaactctggatatatacagtaatctagttacgccatcgcacttcattc -ctctggaaacttagaagacatcagctgtacgtggaggaaccagacccccgtatgtagcca -aatagaaccaaagttgcttatacaaacacacccaatgacaatggaccgctggagttcgta -aactcggaacgtagtactgcacaaacccagcatttagcaataggagctacgtatgcaact -cccacgtggtaataccttcaagctatcaatatataggtgcctagctaatcgcattcgcaa 
-gcagtattcaagcttgtaaaccagtataataattacagaggctctatgaaacccaacttt -ccagctaaaagtcccaattaaatggttatttcgtacttttaaagtcgcccgttctgttat -tacgcgaattgattctactccaaaattaaacacaaattatcaaccgtttcatttatattt -gtcaatgcagctgtttaaaataaggctctactaaattataattaagacacttattaccag -atttctctagttaagtttgaaccagctcgactaccgcgaaagatacattcccttctctat -ttttcagttcatctatgggtcagagaagcattgaatttattctattcaccctcgtcgttc -acagcgaatcgtcagtgtgatcagtgtatgagaaatatcctaaaccgtttagtcagacca -cacgcttagaacaagtggtctaaaaagactgccctggaaggagtaagaagtatacagctg -atccggtgtatccttcagtcatctgccctatactaattacacgacgcaaggaaaaatagg -tttattttctaggcaaacccttcataggtgactccgatgtgttacgaatcatgcttgaga -atgtgctatcgttaccgacggataataacgatctccaatgaaccaaatgtagaatgtcta -ttgattacccttttactattcgacttagagataggagatagaacctcagtgtactttttt -agccgaatgggaatctttgggaggtgaatggccataaggtcgtaaatccaaccctcttaa -agtcttccatattatatcgttgttcgtggaatcgataacagatttgttgacccatagtaa -atgtatactagtttatgttgtaagtgtagattgttttccgattgccgtccaaactttatg -tcgtaattgtagaccagtaaagttgaccaaggtaagtgcccagcgatcctgcgagatcga -tcgccaatttttccagtcactgtaagtgtaggtttagataaagccgtatgagttatatca -taagggcctcggaaagcagcttcgaaccaaagttcccttataatagtagtttaactataa -aagtatatactggtctgtcgccctttcacgatttgttttaccggtttatgaagcgttacg -tcattagagcggctccaatttaaggttaacggcttccatgtgtagttgtatacaaggata -acttaaagtatctgttcagcgagctagttaagttatcctcgatagaacacaactcagagg -tcccaagatcgggtttgcaacttgctaatttattctcaaggcaaattgggaattatcgat -acctgtataccataaggtcgctcgatgtgatgcttatgtcttctggtgatcctaccttag -ttagtgctgattaacggaacattaatgtttatcgttttgagatttagccaattctctgat -tctaactcaagatgccttatctgacgtgctatgcagcccctaagtattttacattgtaat -aggacacgctcctttaaaactcgccaaaaggtcgttgtggttctctactggttaactata -taatttacagctttgttgagctagttcctctttggtttaagtcctcaatattagttggtt -cgagcgataagttggctagttaccttagtcactatattagatccgaatgttatgcttcat -ctgaagaccgccaccctccaaaatttcttttaagactcacttattgcaaggtgtaggtga -attcggctcgtttctcaagtggtgtatctgtacacgagtttccatattttcatcaacagc -caccgcacacttatgtcactctaggtattaaaagtcgctctacaaggggacgcaattaag -aaacagacatgctagtcaaaaataaacatagcgaggcaccactaattcggccgcttatca 
-atgggatgctctgcgcgagacgcgccagagctcagtagttagttcggacatacatttact -tcagatgatcaattagttttctacaaatgcttactctaccccgaaaaaagtcaccagact -cttacgtctctttagtatccttccgtcttatataaggtcagtcccccgtttcggtaccct -ggaatttactaagaataatgaaacagcccccaaggacgtacgtttacaaatgatagacca -gatcgcctagcttattccgacgcatgttgcatagaattgaaccaacggaatgtgagagta -actagatgagccgaccacagcacccgtttgcgtcgcagaatacgcctgatagttcggcca -cgaaatcatatgtcctttgagtattaagtatttgtaatgatcaatcgagctcaagcaagc -ttacacttcctcggatattcagggaacttagtgcctttgaaagatacgttgatcaacgaa -aaattgataatggctcatatggaatgcctacctcatagtgctgaattaacacagcactgc -ggacctaacttttcgaggtttcaagttcacgtctcaaaacctaataggctggaatatgta -gggatcctcggtgaatttgtgattgggtttgttgtagtactgaccaagtgaatattcttt -ttttctaaaagcagatctgctgccgggcactacgaaggagatctctgtgtatcattattg -cttcttgacatgatgactcttaaatcactgtgggtgtgcaaaacgatagcacaacccaat -tcgatagtacatattgttgatacttcgcactaaaccgttcatatttaaaggttgtgctcc -ttccttcgttaaatactggtgacttggtcctatctactattagctagacctctggggaac -cacgcccccgtaaaacctgtgcaagagagggggtcatacatcttagacatcgcgcctcca -ccagggaagcattgggtgattgaccaggtgtgtaacaaatatgattattcttatactaat -attagcaaagatgcataatgatttgtattaaatgtataattgaattgataagggtctttt -agtcagtgatagagtagtataaggtagacattagaactcttaaccggacgcagatttttc -ggtcttagtaagccaattagtcgacaaaacaaggtaagagcggttactagtagtacctat -aatgcactgaatcttcggtcgaagtatagttctaatgctatgcagattgtgacggcgaca -aatgttcagacttatatcatgaaacaagctcttgtaagtattgacaaatgaaaagattga -atatttttaaatacaaaatgcgcctacttattaggggaattaaccagattgaaggccaat -cctcacatgtaatgagataatagacgataaatgaaattcttgtaatagttgaactgctac -gtgatgggtattatatatgattgagatcctccaattgccgacgtcttgtcttgatgccca -aaagattgtcaacgaggagctccctcgcgtacctgtcgtccgtatcataaacgacgcgac -atgtacagcactccgaagtataagcaataataatgcgggtaatccagactagatcttttc -ggactcaatgcggtttcacggtaaacatgattaataccggagagtagtcgagcttatcag -cgatgcaagcgaattcattgtgccaggagatacgttgcagataaaaccggcaacgtatgt -caacaagttttggcgatctcgttgtttgtattcgacgaggcgcgggaacttcaagaacta -tcgtatattcaagtccattaccttttagtttcagactggtggagctgactaaagttatat -catcattttgtacactggtttagttaacgataatttcagatttaacatgaccagacgata 
-atcgctgtatatccagttggaatgtggtttgccagaaaggttaacttataatcaagcctc -tcttcagtcttgattcgtcgtatcccatccattgcgctatacctcagtgtatttggagct -gtagttataccgtgtgctaagatcagtagacatgacgagagcaatattatctaccttaca -agcatcaacggacgtctagtcggaacaaaagactctaaaactcgaacttcaggttaatat -actatagttctgtattcagcagttattcttatattcgatattatcttgcctattggatgt -ctgactttagtatattaatcatagtatctgccatgtaaaggtgccagtactaaatctgtt -tcacagtgcgaattataaacggttacaaccattaaagacaacaagaccctatagctttat -ttgaattttgtcaatgcgcaacttggagctcgcgatacatcccaattagtctatagggtc -gggacgattctacggcatttctggttataatgacaacatggattgtggcccgagaatcgc -tctttcattaattaagcaatcattacagtcttataagcgctacttccgagtggtagcagg -taactcgatataaggtcgcatgagccgaatagcttaaaaaacaggccaccgaacattgat -agagaataccgaccacagcgcaacctttgattactttcattaaattgtacggctcactcg -acatcaagcttaagattgcgataatgtgaactcaaatggatcagtactgaagaaccgtaa -cccacttcgcagaaagcgtacccagagaagatacgctgttacaatatacagggtgaaatt -attgcctgttcttcgtaaccatttcgccaaacttggttagaaatgatagccattcatgat -agaaataagctgaatgataccagtatctttaactatgtagtcagggggaagataacgatg -gtccatgtatgtttctgatatgtgacagtattggccgcgtaatttgctaacgaagctact -taatgcctttgagcttcatatagatttctttaatcaaaatcggcaaaaagatagtatgag -ctataatatatgctagtagagaactctggaccatcatctatatgaatactgattcgagcg -tgcaattactttagcctgcgtactactgactctacaaaacactctgagataagtttgtag -tcagtaagtcgctctctataaaccttttggatgaccattgtacagccacttatagatccc -aataaatagcacaggagacagagtttttcaatgctcgatcatttgccgatagtattttcg -tctaacctcagggcacctattatttgatacctaacctaacggccctttcacaatggagaa -atatatgacatcgggacaaacacaaatggtgggtggccaggagatatgacatggtggcgt -ctctaagaaacacggactccctctaggcaaactcacgtaaccaattttaatgtcaaacaa -aacgctcgaaaagattttgccgtgtaatgacctggtacattgactggtcaggaatacatc -actgtagttgccgtagtgtcctgttggtgttccatcaagacacatcgtataacgcaattt -acgacggacatcagatcaagttatacagattatttaagtatcacgtgtgcattgggacat -aagggatctcacacatgccttggaacatttttgctttgtgccgctttttcgctgcactac -caatccttacttaccagtatattcaaaggtcgttaacagaatgagaaaggttagggctct -aagttatcgtcgattgggatagacgagacatttgcgagcgccctccacggatacgaatct -cccatatcaatgtgaactggatgctatgcagtttagttcttacgtctcctagtggtaaaa 
-atcaaagtagcactcgcatagcagttattcagaacctaatacacaaaaccgtcaaacatt -ttctaattctaggtatgggccgatcataggagctaaggtgaaactcataaatgttttgtt -agatctagcatcctaaaaagatgcatatactgagtagctggcgtgcattctctcaattgt -atcctttttaactgaactagtcggtcccatttcgtgactgagatctattaaccgataaga -ttaataacactcgcattcgtatcagctcagagtgaagtttttcaataatttgactgatat -attaacttctaaaataaccctttaagcctcggatccgtttcccaatcacatcaaaaattc -ttattccaactatctacggattaacaacgtgcatggggatcgtagtaagaacttgttccg -atcactttgagtatatcaagttgacggcccggttattattgaatagaaacattcacctgc -taaattaaataccgcacatcggatacccgatttcagagggccgtcttactaagggcaggc -tttgttcggtttaactgagatgttcattattttacagtatgcttcaactaatatgtaacg -aaggacagtggatctgtctccatagtagatcttcagtcgtgaatttcataccgctcctat -ttaagttcgcgttcgagttgttgatcatggcacgtgaaagcaacccctagtattctagac -gaaaattttttctagttcatctgataatttgccaattcaaaaacaaccgctggtttcccg -gcgcattctctaaaatggaagtcgaacctagagccattatttgtcggtaacccatgagtt -ccttcttttcagaagttaatacactgtggtcctatacagaggaaaaacagcggttatata -cgatcgtggcataacaacattggatcaagatagcaatttggctacctattctaattctca -ctagattcggtattccactacaatatcggcagattaggattggatgaataatcggtgttt -aagtccggttgcgtctccaatctcctaatttttattaatattgatcttggtgacctattg -taaataaaaacttcaagactttgaataacggtgaaaagatagaagactcatttgaaaatg -gatcatccacagatccaaacattagcaagacactaatccccaactagctattctgatcgc -gatcgtgctgcagtactcctgtcacaatagtctgttcatgatctaattctttttgggctt -tgttcgatggtgattcagaatctttatccggtcgcttccctgtagctactttgtggggat -attgcccggggattatagggttgagatcgtttcctaaaagtatttaaaccaagtagactt -caactaaactacatcagaacatcgtgaagacaccatacgcggtacctttatttaccgata -acatttcttcaagaaataccggtaagcagcataatgaccctaaacagctcggggtatcgt -cgtagttttaaattttatttaggttactgctcaaggaataaaaactaactatttaattta -taataatattacaaggctcacactgattagatttgtctataagacttcgcgatcccccat -taccggattgtcttaagaataaactagataaaccatgcattttctagataaggcctttag -tctaattagatacaaaaaacacgatagttgcatccttaatttattgtgtcaaacctggaa -ccttttaattacccgcaaatcactttatgtcgagactacctctgaaatttattatctacc -taccgcatgaggacttgaaccatcttgtaggagttatgtttattagctaagattcgttta -tcctgtagcggtccatgtatattcaacaagcaaaaagcactcagaattgtttttagttga 
-gtcaagactgatatataaataagtttccctagttttttcgtggtgggacgatattgaatt -gaatcttaaccgaagagtttcccactctgtcgcacaataatacacgccaatatttccagc -cctgcttatgccttaatcggttactcaatctcccattgaagttcattttgatctgcatag -aagtttcgggcccagccttttttctgccaccttcctccaagctctgtagacgcactctaa -gattgatgctcacatgtattaattctacattaacataaatatataagtcatgcatcttcg -agtaaaatatctggttctccaacatgtcctggcacgtatcgttataatgcccatacatgt -agtattaaaatgattgggttaactggatattaagatcatcgaaattgtaaagtcaaatta -acaatactgtctcaagaccgtgtattcctcgtgctcggaagggctattacgcttacttcc -gttttggtatcttaatatgactttcaaaaattaagttgcagtgagtcctacctgcgtgca -tcggttagcaagagtataaaagttgtttaaacgaactacttgctttacaataccggtcgt -atatatcgccgtgaatccagaagattgtcttctttggattatcaaccgagatcctgtgga -ccgatgttttgggaccttcacagaggactccaggtagagctcgcttttgcattaatctaa -gaattgtacctctctaaaagatctaaaacagtgaatgtgtatttcatggaaaaacacaga -gaaacgtaaattactttaggccgaaaggcacatgagttattatacatatacgagatggtg -gtatacatcgaattcggggcatacactatagttgcattgtatttagctgctttaaataat -atgatattaccttccttacataagacattaccggcataccctggttttcaacttgtgggg -ctttttgacgatcgcactctcatttgatccgagtagggcggtgacccctgcttttcaaat -acaaaaatttcgctatgaaggtaatagattacttttcgctgttatgatagaaacggtaaa -tttaaaattgaaacttctagaaaagtaaagtaacgagaaatgattttgtgaataatgcgg -tcatgattgcgcaagtaagaaaaaaaggcaaaaggatgcgcggaatagaaacttatcagt -cacgggtatcttgatttcattcttcttgtcaattgccgacataggatgaaatcagattcc -aatgcaatacacagtaacccccacccttgattgtaatgtcgatttgaagttgtacgcgtc -gacgaagtggatagtatacgggccttttgtacggtgcgatcaactatgaatctcggcgag -ttagatggtcgtacaatctcacacatagaggtcacttgcctgtaatgacgaattttcggc -taggtactcgaactttattagaagtaaaaatgtgggcaaaagaaggattccattttacaa -gacgattacaatgagttacatgtctctcaacgtagtctttccctagtagtctttgaacta -tttaggtactccagaaaattttagcaaagggtttctgtgtgaatccgccattcatgttta -tgatggaacaataagaataacgccctcgtatgttatcgacagtgaagtcagcagttcggc -caaaaacatattcaatttagtacagatccccagaagttaagctaagtgctctaaaatggc -ctaaacggttatcaaagtaggtctaattactatactaacgggtgcatcgtaataactgct -gtcgatgcaacactatatgatagtgtcgttttgctatatatgtacaatgtgacaaagaag -ccttagcgattcttgcaaacttaggacttcggattctcaatcttaaatgtccgaaaacgc 
-aaagattcaaaaatttaatctatgagcagatatgcctgatggtgactacgcgtatgttaa -ggctaaatgttgacaaccgcacacataatcgaactattgatagtcgggagcataaccagg -tgaacgtactttgttcacgacatttattgacatgttctaaatacgtctcaaaatcacggc -gcactagaaaacgcaatcaaatcattgtcctggtttaagggccgtaatgccggtagtgtc -aaacttcatgagaactttagctggcttttggccagtatttagggaccaagagcactagcc -ttaagctgaatattttgccatttatctactgttataactttaaaacttggtggcaccaga -cttgtcgatacacacgcatcaatctgtaacgtaaaaggtttactaagaacaagcgtagga -attgagtttatattatatttaaactaaaagatgatattagcttctgagggcgatagggct -ccaaatcataaagaggaatatattattacacgattagaaacccacaacatacctcgaatc -gcccaaaagtttgacgaaacttggcagtactccacatctcagtaatacagttgggagagt -ctcaaatgttgttttattactcaatgaaccaccctcataatttcactgctgttccattaa -atttgcaaacgatcatttgctttgaagaaacgtaaaatcgacaaaattacagataagtag -atgcataataaaaaaaactgctcgctataacacgatcatcgtgcattcttacttaggagc -atcacccgcacaataacgtaccttaaactacaacactattagaccgagtactgtaattca -cgaaagctcaagctcgcattgtaaagaacttgctctctcgtaaaatgtgataatagtttg -cggagaggattcaattattttccattgcacctactccactagattcgataaaagaaggtg -gtcctcccttaaaaagaaatgttaagtaacatcggaaccataagcaaagcatgtaagtga -accgtcatccttccctaagaaacataaaggtttttaataatgtcgactgtgaactataac -tgcatcctttcctgacctactccggttccttgttgttatttctgaacgagaccagtagat -aaacaatgtaaaccacagtgggtaccaatggtgcatgtgacgctaccgttgttttaagtg -cccgtacaaacataagaagtcataatcttacttgaaattaattttgccttttattttttt -tcaggctcgaaattaatgatttgttttttttgaccttctagttacgctaatatgcggtcg -cctgtggtttctattgagtcctataacgggatgggatctaatacgtttggttactagtaa -acaaggtataaatttgataccggagtatcaactgtataacatcaagctttatgactcata -cgcgaagtaatgacacaaggctttcaggagatcgcgagtacagagccactaaggggtgta -ttacgatagtgacaccaccgagcgcactcactccccaagtagatttatgatcctacgcta -agtattagatatataaccaaagaggttctagtcagtgcaactcttagaataataattagc -cggttttgcctttttaggcctaatgcaatattcagctagcccttatgtatctcgcgttcc -acagcaccactcatggcacgcgtttaaactaatcaaatataatctatgaatgttatgcca -gtacttgaataaatcaggttttttataagtccttgcatactctcgttatatactgttaga -gtcttaccccatagaaattctttcatctgcaaacttagaagaattctcagctacggggag -cataaagtccccaggatgttgacaaatacaacaaatgtggcttatacaaacactccatat 
-gaaaatcgaaccctcgtggtagttttagccgaaccttgtacggataaatccctccatttt -ccaatagcagatacctatcctactacctcgtggtattaaattaaagcttgaaatatagag -ctgcatagcttatccaattcccaagcacgagtctaccgtcgtaaccacgatttgatttac -agacgctagagcaaacccatctttaaacatataagtaaaaattaaagggtgagtgcgtac -gtgtttactagcaacttcgcttattaagacaattgtttataagccataattaaaaacata -tgttcaacaggttcattgatatttgtaattgcacaggtttttaataaggatctacgtaag -tataatgaacaaactttttaccagagttatattctgtactttgaaaatgctcctctaccg -ccttagagactttcaattagattttttgcagttaatctatgcgtaagtgaaccatgcaag -ggatgcgattcaaccgcctcgtgctaaccctatcgtctgtctcataactgtaggtctaat -ataattttcagttttcgaacacataaccctttgaaaatctgctatttaatgtctcacctg -catgcactatcttctatactgctcagaacggctatacgtcactatgctccaagtgacgat -ttaaacgaagcaaggaataataggtttattttagtgcaaaacaattaagtgcggactacg -tgctctttacaataagccttgtgattgggctataggttaagtcccatattaacgatctcc -aatgtacaaaatcgacaatcgctttgcattacccggttactagtcgaattacagatagct -gttagatactcactctaattttggacaacaatcccaatcttggggtcgtctatcgcctga -agctcgtaaatccttccatcttaaacgattacatattatagacttgttcggggtagagat -atcacagttgtgcaaacattgtaaatcgatactagtttatgttggtagtctagttgcttt -taccattccccgaaaaacttgatctactatttcgacaacagtaaacttgaactaggtaag -tgaaaacagagaatgcctcatagtgccactatttgtccactatatgtaagtgtagcttta -cataatccactatgactgagatcattacggcctaggaaagcagcgtagaaaaaaagggcc -cggatattacgactgtaactataaaactagttactggtagcgcgccatgtatagatttgt -tttaccggttgtggttgcgttaacgaatttcagccgcgaaaattgatccgttaaccagtc -catctcgacttctataaaacgataaagtaaagttgatgttcagcctccttcttatggttg -catcgagagtacactactcagtgggaaatagatcggggttcctacttcagattgtattat -ctaggcaattgccgattgtgccatacctggataaaataagctacctacatgtgatgctta -tctattatcgtcatactaccttagggtgtcctgttgaacgctacattaatctttagccgt -ttgagatgttccaatggataggagtctaacgcatgatgaagtttaggaaggcagagcatc -ccactaagtatgtgacagtgtatttcgaaacgagacgttataaatagaaaaaaggtcctt -ctggttctattctgctgaactattgaatggaaagattggttgacctacgtactatttgct -tgaagtcatcaatttgacggggtgagagacatatggtgcatactttacggactctatatt -ttagatcagaagcttagcagtcttctctacaccccctcacgacataattgcttttaagaa -tctatgtttgattcctctacgggaattcggatccgttcgcatgtgcggtttatctaaacc 
-aggggacatatgttcagctaaagcatacgaacactttgctaactagacgtatgtatagta -gctataaatcccgacgatatttacaaaaagaaatgagactcaaatatatacatagcgacc -ctacacttattcgcaccctgatctaggcgatcctagcacccacacccgaaagtgagcact -agtgtcttccgtattaaatttactgcagttgagattttagttgtctactaaggattactc -taacccgtaataaggatcaagactcggtactagctttactatcattccctatgtgttttc -ctaactcacaagggtacgtaccagcctatgtaattacaataatgataaagacacaaagga -agtaactttacaaatgagtctccagttacactagcttagtccctcccatcttgctttgaa -gtctaaatacgcaatctctgaggatatacagcagaagaacactcataacgttggagtcca -agaattagactcatagggcccccaacatttaatatgtactgtgagtttgaaggtgttcta -ttgttaattcctgctcttgatacatgacacgtactccgtgtttaaggcttcggactgact -ttctttcataagttgagcaacgaaaatttcagaatcgataagttggattcactaactaat -acggctgattgaaaactccactccggacctatatggtcgacctttatacgtaaccgatat -aaaacttataggctggtatatcgagccttcctagcgcaatttcggatggggtttcttcta -ctactcaacaacggaatagtctttgtttagtaaaccagagctcaggacgcccaatacgta -ggagagcgctgtggagcatgtgtcattatggactggagcactcttaaatcactctgcgtg -tgctaaacgatagatcataacatgtcctgagtaaattttcttgatacgtcgcaatatacc -gttattagttaaacgttctcatccgtcatgcgtgaaatacggctgtcgtgctcagatata -ctattagcgactcatctcgcctaacacgcacacgtataaactcggaatgactgccgctct -tacatattagaaatacagactacaccacggaagcattgggtcattctcaaccgctgtata -aaagatgattagtcttataataagattaccaaagaggcagaatcatgggtagtaaatcta -ttattcaagtgattaccgtcgtgtaggcagggagtgaggacgagatggtactcaggacaa -atattaaccggacgaagtggtttacgtcgtactttcactattagtagtaaatacaaggta -acaccggggaatagtactaaatataatgatatctatcttcgggagaacgagtcgtctatt -gctttgaacattctcaaggcgtaaaatgtgctgacttatagcatgatacaaccgattgtt -acttttgtctattcaaaagattgaatagttttttatacaaaagccgcatacttatgacgg -ctagtatacagtttcatcccctagcatcaatgctatggacagtattgaacttataggaaa -ttcttctaatagggcaaatccgtcgtgatgcctattttttttcagtcacatcctcaaatg -gcactagtattgtcgggatcccattaacaggctcaaccacgagctcacgcgaggacatgt -agtccgtatctttaacgaagcgacagcgacagaactcccatggataaccaattataaggc -ccgtaatcctctagacatcgtttaccaataaatccgctttctccgtaatcatgttgaata -ccccagagtagtccagatgataaccgatgaaacacaagtctttctcaatgcacttacggt -gaacttattaccgccaacgtagctcatcaaggttgcgacatctagttgtgtgtttgcgac 
-gagcccagcgaacttcatcaactttcgtatattcaacgccttgtaattttactttaagac -gcctggtgatgtagattcttagataatcagtttgttatcggctgtactttaccataattt -cacaggtttcaggtcaagaagattatagctgtatatacagttccatgctcggtgcacaga -aacgtgatcggataataatcaatcgcttatgtcgtctttaggcgtatccaatacatgccc -cgataccgcagtgtatttcgacatgtaggtataccgtcgcatttgagctcgagtcaggac -gtcagctagattagattccttaatagaatataccgacctctagtccgaactaaactatag -ataacgccaacttcaggttaattgtctagtcgtctgtttgcagatgggattcttagatga -gtgagtatcggccatattggttcgagcactttagtttttgatgcataggatatgcaatgt -atagctgaaagtactttatctgtttcaaactcacattgattaaaccggtaaacctttaaa -gactacaagaaaatattcagtgagggcaattttgtcaatcacaatcttccagctagagat -acttcacaatttgtcttgaggctacgcaacattagacggattttcgcgttttattgaaat -aatcgaggggcccaagagtatccatagttcattttgtaagatttctttacaggcttatta -cagcttcttcagactcctacatgcttacgagttatatgctagcatgtgaacaatagatta -atatacaggaaaacgtacattgagagagatgaccctacacagcgcaaccgttgagtactt -tcattaaagggtaacgctctcgagacagcatccttaagatggccttattgtcaaatcatt -tgcagaagtacgcaagatccctaaccaacgtagaagaatccctacaaacacatgagacgc -ggtgaaaatagacagggtgttagtattcaatcttcggagtatcaatttcgccaatcttgg -tgagaaagcataccctttcttcagagaaagaagatcaatcataacactatctttaacgag -gtacgcacgcgcatcattacctgcctccatggatctttaggatagcggaaagtattggca -gcgtattgtgatttcgttcctactttatcaatttcacattcatatacatgtcttttatca -aaatcgccaataagataggatgagctatattagatgctagtagagttcgcgccaacatca -tcgataggaatactcaggacagcgtgataggacttttcaatccctaatactctctataat -tataactctctcttaagtttggaggcagtaacgcgctctatataatcagtttgctgcacc -attcttcagcctctgatacatacaaataaattccacagcagtaagagggtttaattgaga -catcttgggaacttaggattttactctaacatcaccgaaacgattattggataccgtacc -taaacgaactttctcaaggcagtaatataggacatccgcaataacacaaatgctgcctcc -ccaggagttatgtcttcctggaggctatatcttacacccactcactataggcaaactaaa -gtttaaatgttgattgtctaaaaaaaagatagataagagttggccggcgtagcacatgcg -aaagtgaatcgtaagctataattctctggacttgaagttctgtcctgttcctctgcaaga -aacaaacttcctttaaagctatttacgacgcacatctcagcaagttataaacatgttgga -agtttctagtcggaattcccaaagaacggatctatctaatgcattcctacatttttcctg -tctgccgatggtgccatcctattcaaagaatttcttaaaagtagattaaatgggactttt 
-aacaatgagtaaccttacgcctctaagggttcctcgagtgccatacaccagtcaggtccg -agccacatacacggagaacattctaacatagcattctcaactcgatcatttgcaggttac -ttctttcctatcctagtgctaaaaatcatacttgcaatcccatagcacggattaagaacc -taagaaacaattcagtaaaacatgttcgaattcttggtatgggaacatcattgcagctat -ggtctaacgcattaatgtttgggtacatcttccatcatataaacaggaagagtctgacga -cagggagtgcttgcgatcatgtctatcattgtgaaatcaaattgtagctcacatgtcgtc -tatgagagcgtgtatccgataagatttagaaaaatagaagtcgtataagatctcactgaa -cttttgaatgaatgtgaagcatatatgatctgctttaataaaactttatccataggatac -gtttccaaatcaattcaataattattagtcaaaatagataaggatgaacaacctgaaggc -cgatcggacgtagaaagtggtcccatcactttgagttgatattgttgaaccacacgttat -tatggttttcaaacagtctcaggatattgtatatacagataatccgataccagttgtctg -acgcccctcttacgtaccccaccctttgtgacgtttaaagcagttgttcagtattttaaa -ctaggcggcaactaatttggaaagaagcacagtggatatgtctaaattcttgttattcag -gcctgaatttaatacaccgcatagttaacttcgcggtagagttgttcatcatgcctcctc -taagctaccacttctatgatacaccaatagttgttctacggaatctgataattggccaag -tcataaacttccgctgcgttcaacccccttgctcgaatatccaactcgaaaagacagcct -tttggtgtccggaacaaatcagttacttcttttctgatgttaattctctgtggtcagata -cagaccaaaaactccgcggatttaccatcctccaagaacaaatttgcatcaacatagcat -tttggctacatattctaagtctcaatagtttaggttttcaactacattatcccaacatta -ggattggaggaataatagctgggtaagtccccttgcgtctacaatcgactattttttatg -aatatgcttctgccgcacctatggttattaaaaaagtcatgactttgaagaaccctgaaa -agatagatgaatcaggtgtaatggcagcagccaaagagcatataattagcaacactctaa -gaacattatagatatgatgatagcgatcgtcatgatgttatccggtcacaatagtagctt -catcagctaattcgttttgccagtggtgacttgcgctggaagaatcgttatacggtccct -tccctcttgatacggtgggggcttattcaaccgcgtggattgggttgtcatacttgcatt -aaacgatgtaaaccatctagtagtcaactatactaaatcacaaaatagtgatcaatacat -acccgcttcatggttttaaccatttaattgattaaagatattccgctaagaaccattatc -tacctaaactgatcgccgtatcctagtagtttgaaatttgatgtaccgtaatgatcaacg -aagtaaaacgttatattgtatgtagaataataggtcttggagctaaatgatgtgattggt -agtgaagacttacccttacaactttaccggtttctcggaagaatatactagagaatcaat -gcatgggctacataagcactttagtctaatgagataaaaaatacacgagtcttccatcat -gaattttttgtcgaaaaactcgaacctggtaatttaaaccatatatctttatgtcgtcaa 
-taactctcatatgttttatataacttcccaatcacgacttgtaactgcttgttcgactga -gctgtttgagctatgaggccgggatccggttgagctacatctatttgctacaagaaaaat -gaaagcacatttgttgggagttctggctacactcatagagaaataagtggcccgagtggg -tgcggcctgcctccatattcaagtgtatcttaaaccaagtggttccaacgctcgcgctaa -agaattaaagcctttatttcctccacggagtagcccgtaatccggttcgaaagagaccat -tgaagttaattttcatatccagtgaagtttaggcacaagcatgtgttctgccacatgcct -caaagcgctcttcaaccaagatatgattcatcctaacttcgatgaatgcgtctgtaacat -aaatatagaaggaatgattcggcgagttaattttcgccttctccaacatggcatccctac -gttcgttataaggaccatacatgtaggttttaaaggtttgcggttaatcgatatttacat -catagaaattctatagtcaaatttacaagactctagatactcactcgttgcagccggcta -ggaagcgctttgtaccttacttcccttttcgttgcgtaatatgaatttcatatagtaagt -tcaaggcactcatacctccgtgaagagggtagatagactattaaagttgtttaatagtac -gtattgatggaaatgacccgtaggagatttaccactcaatccacaagattcgctgctgtg -cattatcaaaacagtgcatgtcgaaacatgggttgggtccttcaaacacgaatccaggta -gagatacctttgcaattttt diff --git a/examples/regexdna-output.txt b/examples/regexdna-output.txt deleted file mode 100644 index d36baa5be8..0000000000 --- a/examples/regexdna-output.txt +++ /dev/null @@ -1,13 +0,0 @@ -agggtaaa|tttaccct 0 -[cgt]gggtaaa|tttaccc[acg] 3 -a[act]ggtaaa|tttacc[agt]t 9 -ag[act]gtaaa|tttac[agt]ct 8 -agg[act]taaa|ttta[agt]cct 10 -aggg[acg]aaa|ttt[cgt]ccct 3 -agggt[cgt]aa|tt[acg]accct 4 -agggta[cgt]a|t[acg]taccct 3 -agggtaa[cgt]|[acg]ttaccct 5 - -101745 -100000 -133640 diff --git a/examples/shootout-regex-dna-bytes.rs b/examples/shootout-regex-dna-bytes.rs deleted file mode 100644 index 773fd9ba8d..0000000000 --- a/examples/shootout-regex-dna-bytes.rs +++ /dev/null @@ -1,68 +0,0 @@ -// The Computer Language Benchmarks Game -// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ -// -// contributed by the Rust Project Developers -// contributed by TeXitoi -// contributed by BurntSushi - -use std::io::{self, Read}; -use std::sync::Arc; -use std::thread; - -macro_rules! 
regex { - ($re:expr) => { - ::regex::bytes::Regex::new($re).unwrap() - }; -} - -fn main() { - let mut seq = Vec::with_capacity(51 * (1 << 20)); - io::stdin().read_to_end(&mut seq).unwrap(); - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, &b""[..]).into_owned(); - let clen = seq.len(); - let seq_arc = Arc::new(seq.clone()); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - let mut counts = vec![]; - for variant in variants { - let seq = seq_arc.clone(); - let restr = variant.to_string(); - let future = thread::spawn(move || variant.find_iter(&seq).count()); - counts.push((restr, future)); - } - - let substs = vec![ - (regex!("B"), &b"(c|g|t)"[..]), - (regex!("D"), &b"(a|g|t)"[..]), - (regex!("H"), &b"(a|c|t)"[..]), - (regex!("K"), &b"(g|t)"[..]), - (regex!("M"), &b"(a|c)"[..]), - (regex!("N"), &b"(a|c|g|t)"[..]), - (regex!("R"), &b"(a|g)"[..]), - (regex!("S"), &b"(c|g)"[..]), - (regex!("V"), &b"(a|c|g)"[..]), - (regex!("W"), &b"(a|t)"[..]), - (regex!("Y"), &b"(c|t)"[..]), - ]; - let mut seq = seq; - for (re, replacement) in substs { - seq = re.replace_all(&seq, replacement).into_owned(); - } - - for (variant, count) in counts { - println!("{} {}", variant, count.join().unwrap()); - } - println!("\n{}\n{}\n{}", ilen, clen, seq.len()); -} diff --git a/examples/shootout-regex-dna-cheat.rs b/examples/shootout-regex-dna-cheat.rs deleted file mode 100644 index 1bde7ab1ff..0000000000 --- a/examples/shootout-regex-dna-cheat.rs +++ /dev/null @@ -1,90 +0,0 @@ -// The Computer Language Benchmarks Game -// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ -// -// contributed by the Rust Project Developers -// contributed 
by TeXitoi -// contributed by BurntSushi - -// This technically solves the problem posed in the `regex-dna` benchmark, but -// it cheats by combining all of the replacements into a single regex and -// replacing them with a single linear scan. i.e., it re-implements -// `replace_all`. As a result, this is around 25% faster. ---AG - -use std::io::{self, Read}; -use std::sync::Arc; -use std::thread; - -macro_rules! regex { - ($re:expr) => { - ::regex::Regex::new($re).unwrap() - }; -} - -fn main() { - let mut seq = String::with_capacity(50 * (1 << 20)); - io::stdin().read_to_string(&mut seq).unwrap(); - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); - let clen = seq.len(); - let seq_arc = Arc::new(seq.clone()); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - let mut counts = vec![]; - for variant in variants { - let seq = seq_arc.clone(); - let restr = variant.to_string(); - let future = thread::spawn(move || variant.find_iter(&seq).count()); - counts.push((restr, future)); - } - - let substs = vec![ - (b'B', "(c|g|t)"), - (b'D', "(a|g|t)"), - (b'H', "(a|c|t)"), - (b'K', "(g|t)"), - (b'M', "(a|c)"), - (b'N', "(a|c|g|t)"), - (b'R', "(a|g)"), - (b'S', "(c|g)"), - (b'V', "(a|c|g)"), - (b'W', "(a|t)"), - (b'Y', "(c|t)"), - ]; // combined into one regex in `replace_all` - let seq = replace_all(&seq, substs); - - for (variant, count) in counts { - println!("{} {}", variant, count.join().unwrap()); - } - println!("\n{}\n{}\n{}", ilen, clen, seq.len()); -} - -fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { - let mut replacements = vec![""; 256]; - let mut alternates = vec![]; - for (re, 
replacement) in substs { - replacements[re as usize] = replacement; - alternates.push((re as char).to_string()); - } - - let re = regex!(&alternates.join("|")); - let mut new = String::with_capacity(text.len()); - let mut last_match = 0; - for m in re.find_iter(text) { - new.push_str(&text[last_match..m.start()]); - new.push_str(replacements[text.as_bytes()[m.start()] as usize]); - last_match = m.end(); - } - new.push_str(&text[last_match..]); - new -} diff --git a/examples/shootout-regex-dna-single-cheat.rs b/examples/shootout-regex-dna-single-cheat.rs deleted file mode 100644 index 70a979c6d4..0000000000 --- a/examples/shootout-regex-dna-single-cheat.rs +++ /dev/null @@ -1,75 +0,0 @@ -// The Computer Language Benchmarks Game -// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ -// -// contributed by the Rust Project Developers -// contributed by TeXitoi -// contributed by BurntSushi - -use std::io::{self, Read}; - -macro_rules! regex { - ($re:expr) => { - ::regex::Regex::new($re).unwrap() - }; -} - -fn main() { - let mut seq = String::with_capacity(50 * (1 << 20)); - io::stdin().read_to_string(&mut seq).unwrap(); - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); - let clen = seq.len(); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - for re in variants { - println!("{} {}", re.to_string(), re.find_iter(&seq).count()); - } - - let substs = vec![ - (b'B', "(c|g|t)"), - (b'D', "(a|g|t)"), - (b'H', "(a|c|t)"), - (b'K', "(g|t)"), - (b'M', "(a|c)"), - (b'N', "(a|c|g|t)"), - (b'R', "(a|g)"), - (b'S', "(c|g)"), - (b'V', "(a|c|g)"), - (b'W', "(a|t)"), - (b'Y', "(c|t)"), - ]; // combined into one 
regex in `replace_all` - let seq = replace_all(&seq, substs); - - println!("\n{}\n{}\n{}", ilen, clen, seq.len()); -} - -fn replace_all(text: &str, substs: Vec<(u8, &str)>) -> String { - let mut replacements = vec![""; 256]; - let mut alternates = vec![]; - for (re, replacement) in substs { - replacements[re as usize] = replacement; - alternates.push((re as char).to_string()); - } - - let re = regex!(&alternates.join("|")); - let mut new = String::with_capacity(text.len()); - let mut last_match = 0; - for m in re.find_iter(text) { - new.push_str(&text[last_match..m.start()]); - new.push_str(replacements[text.as_bytes()[m.start()] as usize]); - last_match = m.end(); - } - new.push_str(&text[last_match..]); - new -} diff --git a/examples/shootout-regex-dna-single.rs b/examples/shootout-regex-dna-single.rs deleted file mode 100644 index b474059600..0000000000 --- a/examples/shootout-regex-dna-single.rs +++ /dev/null @@ -1,57 +0,0 @@ -// The Computer Language Benchmarks Game -// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ -// -// contributed by the Rust Project Developers -// contributed by TeXitoi -// contributed by BurntSushi - -use std::io::{self, Read}; - -macro_rules! 
regex { - ($re:expr) => { - ::regex::Regex::new($re).unwrap() - }; -} - -fn main() { - let mut seq = String::with_capacity(50 * (1 << 20)); - io::stdin().read_to_string(&mut seq).unwrap(); - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); - let clen = seq.len(); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - for re in variants { - println!("{} {}", re.to_string(), re.find_iter(&seq).count()); - } - - let substs = vec![ - (regex!("B"), "(c|g|t)"), - (regex!("D"), "(a|g|t)"), - (regex!("H"), "(a|c|t)"), - (regex!("K"), "(g|t)"), - (regex!("M"), "(a|c)"), - (regex!("N"), "(a|c|g|t)"), - (regex!("R"), "(a|g)"), - (regex!("S"), "(c|g)"), - (regex!("V"), "(a|c|g)"), - (regex!("W"), "(a|t)"), - (regex!("Y"), "(c|t)"), - ]; - let mut seq = seq; - for (re, replacement) in substs { - seq = re.replace_all(&seq, replacement).into_owned(); - } - println!("\n{}\n{}\n{}", ilen, clen, seq.len()); -} diff --git a/examples/shootout-regex-dna.rs b/examples/shootout-regex-dna.rs deleted file mode 100644 index b96518e4c4..0000000000 --- a/examples/shootout-regex-dna.rs +++ /dev/null @@ -1,68 +0,0 @@ -// The Computer Language Benchmarks Game -// https://benchmarksgame-team.pages.debian.net/benchmarksgame/ -// -// contributed by the Rust Project Developers -// contributed by TeXitoi -// contributed by BurntSushi - -use std::io::{self, Read}; -use std::sync::Arc; -use std::thread; - -macro_rules! 
regex { - ($re:expr) => { - ::regex::Regex::new($re).unwrap() - }; -} - -fn main() { - let mut seq = String::with_capacity(51 * (1 << 20)); - io::stdin().read_to_string(&mut seq).unwrap(); - let ilen = seq.len(); - - seq = regex!(">[^\n]*\n|\n").replace_all(&seq, "").into_owned(); - let clen = seq.len(); - let seq_arc = Arc::new(seq.clone()); - - let variants = vec![ - regex!("agggtaaa|tttaccct"), - regex!("[cgt]gggtaaa|tttaccc[acg]"), - regex!("a[act]ggtaaa|tttacc[agt]t"), - regex!("ag[act]gtaaa|tttac[agt]ct"), - regex!("agg[act]taaa|ttta[agt]cct"), - regex!("aggg[acg]aaa|ttt[cgt]ccct"), - regex!("agggt[cgt]aa|tt[acg]accct"), - regex!("agggta[cgt]a|t[acg]taccct"), - regex!("agggtaa[cgt]|[acg]ttaccct"), - ]; - let mut counts = vec![]; - for variant in variants { - let seq = seq_arc.clone(); - let restr = variant.to_string(); - let future = thread::spawn(move || variant.find_iter(&seq).count()); - counts.push((restr, future)); - } - - let substs = vec![ - (regex!("B"), "(c|g|t)"), - (regex!("D"), "(a|g|t)"), - (regex!("H"), "(a|c|t)"), - (regex!("K"), "(g|t)"), - (regex!("M"), "(a|c)"), - (regex!("N"), "(a|c|g|t)"), - (regex!("R"), "(a|g)"), - (regex!("S"), "(c|g)"), - (regex!("V"), "(a|c|g)"), - (regex!("W"), "(a|t)"), - (regex!("Y"), "(c|t)"), - ]; - let mut seq = seq; - for (re, replacement) in substs { - seq = re.replace_all(&seq, replacement).into_owned(); - } - - for (variant, count) in counts { - println!("{} {}", variant, count.join().unwrap()); - } - println!("\n{}\n{}\n{}", ilen, clen, seq.len()); -} diff --git a/fuzz/fuzz_targets/ast_fuzz_match.rs b/fuzz/fuzz_targets/ast_fuzz_match.rs index 593ab193ef..58a8ebbf80 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match.rs @@ -1,7 +1,9 @@ #![no_main] use { - libfuzzer_sys::fuzz_target, regex::RegexBuilder, regex_syntax::ast::Ast, + libfuzzer_sys::{fuzz_target, Corpus}, + regex::RegexBuilder, + regex_syntax::ast::Ast, }; #[derive(Eq, PartialEq, arbitrary::Arbitrary)] @@ -19,14 
+21,15 @@ impl std::fmt::Debug for FuzzData { } } -fuzz_target!(|data: FuzzData| { +fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { - return + return Corpus::Reject; }; re.is_match(&data.haystack); re.find(&data.haystack); re.captures(&data.haystack).map_or(0, |c| c.len()); + Corpus::Keep }); diff --git a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs index 8e7c879a51..a4fa0bd737 100644 --- a/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs +++ b/fuzz/fuzz_targets/ast_fuzz_match_bytes.rs @@ -1,7 +1,8 @@ #![no_main] use { - libfuzzer_sys::fuzz_target, regex::bytes::RegexBuilder, + libfuzzer_sys::{fuzz_target, Corpus}, + regex::bytes::RegexBuilder, regex_syntax::ast::Ast, }; @@ -20,14 +21,15 @@ impl std::fmt::Debug for FuzzData { } } -fuzz_target!(|data: FuzzData| { +fuzz_target!(|data: FuzzData| -> Corpus { let _ = env_logger::try_init(); let pattern = format!("{}", data.ast); let Ok(re) = RegexBuilder::new(&pattern).size_limit(1<<20).build() else { - return + return Corpus::Reject; }; re.is_match(&data.haystack); re.find(&data.haystack); re.captures(&data.haystack).map_or(0, |c| c.len()); + Corpus::Keep }); diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs index 9acf3fbd0b..07c2847313 100644 --- a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs +++ b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_dense_dfa.rs @@ -1,9 +1,9 @@ #![no_main] -use libfuzzer_sys::fuzz_target; +use libfuzzer_sys::{fuzz_target, Corpus}; -fuzz_target!(|data: &[u8]| { - let _ = run(data); +fuzz_target!(|data: &[u8]| -> Corpus { + run(data).map_or(Corpus::Reject, |_| Corpus::Keep) }); fn run(given_data: &[u8]) -> Option<()> { diff --git a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs 
b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs index c0470acd67..0bd15f74d9 100644 --- a/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs +++ b/fuzz/fuzz_targets/fuzz_regex_automata_deserialize_sparse_dfa.rs @@ -1,9 +1,9 @@ #![no_main] -use libfuzzer_sys::fuzz_target; +use libfuzzer_sys::{fuzz_target, Corpus}; -fuzz_target!(|data: &[u8]| { - let _ = run(data); +fuzz_target!(|data: &[u8]| -> Corpus { + run(data).map_or(Corpus::Reject, |_| Corpus::Keep) }); fn run(given_data: &[u8]) -> Option<()> { diff --git a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs index 5786d13247..579078c71e 100644 --- a/fuzz/fuzz_targets/fuzz_regex_lite_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_lite_match.rs @@ -1,6 +1,6 @@ #![no_main] -use libfuzzer_sys::{arbitrary, fuzz_target}; +use libfuzzer_sys::{arbitrary, fuzz_target, Corpus}; #[derive(arbitrary::Arbitrary)] struct FuzzCase<'a> { @@ -47,7 +47,7 @@ re.is_match({haystack:?}); } } -fuzz_target!(|case: FuzzCase| { +fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); let Ok(re) = regex_lite::RegexBuilder::new(case.pattern) @@ -58,6 +58,7 @@ fuzz_target!(|case: FuzzCase| { .swap_greed(case.swap_greed) .ignore_whitespace(case.ignore_whitespace) .size_limit(1<<20) - .build() else { return }; + .build() else { return Corpus::Reject }; re.is_match(case.haystack); + Corpus::Keep }); diff --git a/fuzz/fuzz_targets/fuzz_regex_match.rs b/fuzz/fuzz_targets/fuzz_regex_match.rs index 073776776c..ae4d8a2d44 100644 --- a/fuzz/fuzz_targets/fuzz_regex_match.rs +++ b/fuzz/fuzz_targets/fuzz_regex_match.rs @@ -1,6 +1,6 @@ #![no_main] -use libfuzzer_sys::{arbitrary, fuzz_target}; +use libfuzzer_sys::{arbitrary, fuzz_target, Corpus}; #[derive(arbitrary::Arbitrary)] struct FuzzCase<'a> { @@ -43,6 +43,7 @@ let Ok(re) = regex::RegexBuilder::new({pattern:?}) .ignore_whitespace({ignore_whitespace:?}) .unicode({unicode:?}) .octal({octal:?}) + .size_limit(1<<20) 
.build() else {{ return }}; re.is_match({haystack:?}); "# @@ -50,7 +51,7 @@ re.is_match({haystack:?}); } } -fuzz_target!(|case: FuzzCase| { +fuzz_target!(|case: FuzzCase| -> Corpus { let _ = env_logger::try_init(); let Ok(re) = regex::RegexBuilder::new(case.pattern) @@ -62,6 +63,7 @@ fuzz_target!(|case: FuzzCase| { .unicode(case.unicode) .octal(case.octal) .size_limit(1<<20) - .build() else { return }; + .build() else { return Corpus::Reject }; re.is_match(case.haystack); + Corpus::Keep }); diff --git a/regex-automata/Cargo.toml b/regex-automata/Cargo.toml index 6b747a042d..8c1931c668 100644 --- a/regex-automata/Cargo.toml +++ b/regex-automata/Cargo.toml @@ -15,12 +15,14 @@ autoexamples = false [lib] bench = false +# This crate has many many many features. See the crate docs for a description +# of each and when you might want to use them. [features] default = ["std", "syntax", "perf", "unicode", "meta", "nfa", "dfa", "hybrid"] std = ["regex-syntax?/std", "memchr?/std", "aho-corasick?/std", "alloc"] alloc = [] - logging = ["dep:log", "aho-corasick?/logging"] + syntax = ["dep:regex-syntax", "alloc"] meta = ["syntax", "nfa-pikevm"] @@ -88,6 +90,7 @@ regex-syntax = { path = "../regex-syntax", version = "0.7.0", optional = true, d [dev-dependencies] anyhow = "1.0.69" bstr = { version = "1.3.0", default-features = false, features = ["std"] } +doc-comment = "0.3.3" quickcheck = { version = "1.0.3", default-features = false } regex-test = { path = "../regex-test", version = "0.1.0" } diff --git a/regex-automata/README.md b/regex-automata/README.md index ff4fe094c3..c12b07012f 100644 --- a/regex-automata/README.md +++ b/regex-automata/README.md @@ -1,97 +1,117 @@ -regex-syntax -============ -This crate provides a robust regular expression parser. +regex-automata +============== +This crate exposes a variety of regex engines used by the `regex` crate. +It provides a vast, sprawling and "expert" level API to each regex engine. 
+The regex engines provided by this crate focus heavily on finite automata +implementations and specifically guarantee worst case `O(m * n)` time +complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.) [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) -[![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) -[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) +[![Crates.io](https://img.shields.io/crates/v/regex-automata.svg)](https://crates.io/crates/regex-automata) ### Documentation -https://docs.rs/regex-syntax - - -### Overview - -There are two primary types exported by this crate: `Ast` and `Hir`. The former -is a faithful abstract syntax of a regular expression, and can convert regular -expressions back to their concrete syntax while mostly preserving its original -form. The latter type is a high level intermediate representation of a regular -expression that is amenable to analysis and compilation into byte codes or -automata. An `Hir` achieves this by drastically simplifying the syntactic -structure of the regular expression. While an `Hir` can be converted back to -its equivalent concrete syntax, the result is unlikely to resemble the original -concrete syntax that produced the `Hir`. +https://docs.rs/regex-automata ### Example -This example shows how to parse a pattern string into its HIR: +This example shows how to search for matches of multiple regexes, where each +regex uses the same capture group names to parse different key-value formats. 
```rust -use regex_syntax::{hir::Hir, parse}; - -let hir = parse("a|b").unwrap(); -assert_eq!(hir, Hir::alternation(vec![ - Hir::literal("a".as_bytes()), - Hir::literal("b".as_bytes()), -])); +use regex_automata::{meta::Regex, PatternID}; + +let re = Regex::new_many(&[ + r#"(?m)^(?[[:word:]]+)=(?[[:word:]]+)$"#, + r#"(?m)^(?[[:word:]]+)="(?[^"]+)"$"#, + r#"(?m)^(?[[:word:]]+)='(?[^']+)'$"#, + r#"(?m)^(?[[:word:]]+):\s*(?[[:word:]]+)$"#, +]).unwrap(); +let hay = r#" +best_album="Blow Your Face Out" +best_quote='"then as it was, then again it will be"' +best_year=1973 +best_simpsons_episode: HOMR +"#; +let mut kvs = vec![]; +for caps in re.captures_iter(hay) { + // N.B. One could use capture indices '1' and '2' here + // as well. Capture indices are local to each pattern. + // (Just like names are.) + let key = &hay[caps.get_group_by_name("key").unwrap()]; + let val = &hay[caps.get_group_by_name("val").unwrap()]; + kvs.push((key, val)); +} +assert_eq!(kvs, vec![ + ("best_album", "Blow Your Face Out"), + ("best_quote", "\"then as it was, then again it will be\""), + ("best_year", "1973"), + ("best_simpsons_episode", "HOMR"), +]); ``` ### Safety -This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's -possible this crate could use `unsafe` code in the future, the standard -for doing so is extremely high. In general, most code in this crate is not -performance critical, since it tends to be dwarfed by the time it takes to -compile a regular expression into an automaton. Therefore, there is little need -for extreme optimization, and therefore, use of `unsafe`. - -The standard for using `unsafe` in this crate is extremely high because this -crate is intended to be reasonably safe to use with user supplied regular -expressions. Therefore, while there may be bugs in the regex parser itself, -they should _never_ result in memory unsafety unless there is either a bug -in the compiler or the standard library. (Since `regex-syntax` has zero -dependencies.) 
- - -### Crate features - -By default, this crate bundles a fairly large amount of Unicode data tables -(a source size of ~750KB). Because of their large size, one can disable some -or all of these data tables. If a regular expression attempts to use Unicode -data that is not available, then an error will occur when translating the `Ast` -to the `Hir`. - -The full set of features one can disable are -[in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features). - - -### Testing - -Simply running `cargo test` will give you very good coverage. However, because -of the large number of features exposed by this crate, a `test` script is -included in this directory which will test several feature combinations. This -is the same script that is run in CI. +**I welcome audits of `unsafe` code.** + +This crate tries to be extremely conservative in its use of `unsafe`, but does +use it in a few spots. In general, I am very open to removing uses of `unsafe` +if it doesn't result in measurable performance regressions and doesn't result +in significantly more complex code. + +Below is an outline of how `unsafe` is used in this crate. + +* `util::pool::Pool` makes use of `unsafe` to implement a fast path for +accessing an element of the pool. The fast path applies to the first thread +that uses the pool. In effect, the fast path is fast because it avoid a mutex +lock. `unsafe` is also used in the no-std version of `Pool` to implement a spin +lock for synchronization. +* `util::lazy::Lazy` uses `unsafe` to implement a variant of +`once_cell::sync::Lazy` that works in no-std environments. A no-std no-alloc +implementation is also provided that requires use of `unsafe`. +* The `dfa` module makes extensive use of `unsafe` to support zero-copy +deserialization of DFAs. The high level problem is that you need to get from +`&[u8]` to the internal representation of a DFA without doing any copies. 
+This is required for support in no-std no-alloc environments. It also makes +deserialization extremely cheap. +* The `dfa` and `hybrid` modules use `unsafe` to explicitly elide bounds checks +in the core search loops. This makes the codegen tighter and typically leads to +consistent 5-10% performance improvements on some workloads. + +In general, the above reflect the only uses of `unsafe` throughout the entire +`regex` crate. At present, there are no plans to meaningfully expand the use +of `unsafe`. With that said, one thing folks have been asking for is cheap +deserialization of a `regex::Regex`. My sense is that this feature will require +a lot more `unsafe` in places to support zero-copy deserialization. It is +unclear at this point whether this will be pursued. ### Motivation -The primary purpose of this crate is to provide the parser used by `regex`. -Specifically, this crate is treated as an implementation detail of the `regex`, -and is primarily developed for the needs of `regex`. - -Since this crate is an implementation detail of `regex`, it may experience -breaking change releases at a different cadence from `regex`. This is only -possible because this crate is _not_ a public dependency of `regex`. - -Another consequence of this de-coupling is that there is no direct way to -compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must -first convert the `Hir` to a string (via its `std::fmt::Display`) and then -compile that via `Regex::new`. While this does repeat some work, compilation -typically takes much longer than parsing. - -Stated differently, the coupling between `regex` and `regex-syntax` exists only -at the level of the concrete syntax. +I started out building this crate because I wanted to re-work the `regex` +crate internals to make it more amenable to optimizations. It turns out that +there are a lot of different ways to build regex engines and even more ways to +compose them. 
Moreover, heuristic literal optimizations are often tricky to +get correct, but the fruit they bear is attractive. All of these things were +difficult to expand upon without risking the introduction of more bugs. So I +decided to tear things down and start fresh. + +In the course of doing so, I ended up designing strong boundaries between each +component so that each component could be reasoned and tested independently. +This also made it somewhat natural to expose the components as a library unto +itself. Namely, folks have been asking for more capabilities in the regex +crate for a long time, but these capabilities usually come with additional API +complexity that I didn't want to introduce in the `regex` crate proper. But +exposing them in an "expert" level crate like `regex-automata` seemed quite +fine. + +In the end, I do still somewhat consider this crate an experiment. It is +unclear whether the strong boundaries between components will be an impediment +to ongoing development or not. De-coupling tends to lead to slower development +in my experience, and when you mix in the added cost of not introducing +breaking changes all of the time, things can get quite complicated. But, I +don't think anyone has ever release the internals of a regex engine as a +library before. So it will be interesting to see how it plays out! diff --git a/regex-automata/src/dfa/automaton.rs b/regex-automata/src/dfa/automaton.rs index fe3a7fa497..2be080425d 100644 --- a/regex-automata/src/dfa/automaton.rs +++ b/regex-automata/src/dfa/automaton.rs @@ -132,11 +132,7 @@ pub unsafe trait Automaton { /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// // - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. 
- /// let mut state = - /// dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// state = dfa.next_state(state, b); @@ -214,8 +210,7 @@ pub unsafe trait Automaton { /// // /// // The unwrap is OK because we aren't requesting a start state for a /// // specific pattern. - /// let mut state = - /// dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// state = dfa.next_state(state, b); @@ -232,8 +227,7 @@ pub unsafe trait Automaton { fn next_eoi_state(&self, current: StateID) -> StateID; /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. If a match is known to be impossible while computing - /// the start state, then `None` is returned. + /// forward search. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -257,11 +251,10 @@ pub unsafe trait Automaton { fn start_state_forward( &self, input: &Input<'_>, - ) -> Result, MatchError>; + ) -> Result; /// Return the ID of the start state for this lazy DFA when executing a - /// reverse search. If a match is known to be impossible while computing - /// the start state, then `None` is returned. + /// reverse search. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -285,7 +278,7 @@ pub unsafe trait Automaton { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result, MatchError>; + ) -> Result; /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that @@ -386,12 +379,7 @@ pub unsafe trait Automaton { /// // initial bytes of the haystack. 
Note that start states can never /// // be match states (since DFAs in this crate delay matches by 1 /// // byte), so we don't need to check if the start state is a match. - /// // - /// // Also, we unwrap this because the only way to get a None start - /// // state ID is if we asked to search for a pattern that isn't in - /// // this DFA, but we don't use that functionality here. - /// let mut state = - /// dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// let mut last_match = None; /// // Walk all the bytes in the haystack. We can quit early if we see /// // a dead or a quit state. The former means the automaton will @@ -629,8 +617,7 @@ pub unsafe trait Automaton { /// // See the Automaton::is_special_state example for similar code /// // with more comments. /// - /// let mut state = - /// dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// let mut last_match = None; /// let mut pos = 0; /// while pos < haystack.len() { @@ -844,11 +831,7 @@ pub unsafe trait Automaton { /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// // - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. - /// let mut state = - /// dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. 
/// for &b in haystack { /// state = dfa.next_state(state, b); @@ -1819,7 +1802,7 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { fn start_state_forward( &self, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { (**self).start_state_forward(input) } @@ -1827,7 +1810,7 @@ unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { (**self).start_state_reverse(input) } diff --git a/regex-automata/src/dfa/dense.rs b/regex-automata/src/dfa/dense.rs index 3bd2fdcc37..00086cc947 100644 --- a/regex-automata/src/dfa/dense.rs +++ b/regex-automata/src/dfa/dense.rs @@ -732,9 +732,7 @@ impl Config { /// .build(r"[a-z]+")?; /// /// let haystack = "123 foobar 4567".as_bytes(); - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. - /// let sid = dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let sid = dfa.start_state_forward(&Input::new(haystack))?; /// // The ID returned by 'start_state_forward' will always be tagged as /// // a start state when start state specialization is enabled. /// assert!(dfa.is_special_state(sid)); @@ -753,9 +751,7 @@ impl Config { /// let dfa = DFA::new(r"[a-z]+")?; /// /// let haystack = "123 foobar 4567"; - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. - /// let sid = dfa.start_state_forward(&Input::new(haystack))?.unwrap(); + /// let sid = dfa.start_state_forward(&Input::new(haystack))?; /// // Start states are not special in the default configuration! /// assert!(!dfa.is_special_state(sid)); /// assert!(!dfa.is_start_state(sid)); @@ -2887,10 +2883,7 @@ impl OwnedDFA { let start_id = |dfa: &mut OwnedDFA, inp: &Input<'_>, start: Start| { // This OK because we only call 'start' under conditions // in which we know it will succeed. 
- dfa.st - .start(inp, start) - .expect("valid Input configuration") - .expect("valid start state") + dfa.st.start(inp, start).expect("valid Input configuration") }; if self.start_kind().has_unanchored() { let inp = Input::new("").anchored(Anchored::No); @@ -3220,7 +3213,7 @@ unsafe impl> Automaton for DFA { fn start_state_forward( &self, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { if !self.quitset.is_empty() && input.start() > 0 { let offset = input.start() - 1; let byte = input.haystack()[offset]; @@ -3236,7 +3229,7 @@ unsafe impl> Automaton for DFA { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { if !self.quitset.is_empty() && input.end() < input.haystack().len() { let offset = input.end(); let byte = input.haystack()[offset]; @@ -4183,7 +4176,7 @@ impl> StartTable { &self, input: &Input<'_>, start: Start, - ) -> Result, MatchError> { + ) -> Result { let start_index = start.as_usize(); let mode = input.get_anchored(); let index = match mode { @@ -4207,14 +4200,14 @@ impl> StartTable { Some(len) => len, }; if pid.as_usize() >= len { - return Ok(None); + return Ok(DEAD); } (2 * self.stride) + (self.stride * pid.as_usize()) + start_index } }; - Ok(Some(self.table()[index])) + Ok(self.table()[index]) } /// Returns an iterator over all start state IDs in this table. diff --git a/regex-automata/src/dfa/mod.rs b/regex-automata/src/dfa/mod.rs index 31ca5961f7..c289567d05 100644 --- a/regex-automata/src/dfa/mod.rs +++ b/regex-automata/src/dfa/mod.rs @@ -216,8 +216,8 @@ you would any regex. Deserialization can happen anywhere. For example, with bytes embedded into a binary or with a file memory mapped at runtime. -TODO: Include link to `regex-cli` here pointing out how to generate Rust code -for deserializing DFAs. +The `regex-cli` command (found in the same repository as this crate) can be +used to serialize DFAs to files and generate Rust code to read them. 
# Syntax diff --git a/regex-automata/src/dfa/onepass.rs b/regex-automata/src/dfa/onepass.rs index 5563c0ff26..44691d0c8a 100644 --- a/regex-automata/src/dfa/onepass.rs +++ b/regex-automata/src/dfa/onepass.rs @@ -51,7 +51,7 @@ use crate::{ int::{Usize, U32, U64, U8}, look::{Look, LookSet, UnicodeWordBoundaryError}, primitives::{NonMaxUsize, PatternID, StateID}, - search::{Anchored, Input, MatchError, MatchKind}, + search::{Anchored, Input, Match, MatchError, MatchKind, Span}, sparse_set::SparseSet, }, }; @@ -1630,6 +1630,91 @@ impl DFA { self.try_search_slots(cache, &input, &mut []).unwrap().is_some() } + /// Executes an anchored leftmost forward search, and returns a `Match` if + /// and only if this one-pass DFA matches the given haystack. + /// + /// This routine only includes the overall match span. To get access to the + /// individual spans of each capturing group, use [`DFA::captures`]. + /// + /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the + /// given configuration was [`Anchored::No`] (which is the default). + /// + /// # Panics + /// + /// This routine panics if the search could not complete. This can occur + /// in the following circumstances: + /// + /// * When the provided `Input` configuration is not supported. For + /// example, by providing an unsupported anchor mode. Concretely, + /// this occurs when using [`Anchored::Pattern`] without enabling + /// [`Config::starts_for_each_pattern`]. + /// + /// When a search panics, callers cannot know whether a match exists or + /// not. + /// + /// Use [`DFA::try_search`] if you want to handle these panics as error + /// values instead. + /// + /// # Example + /// + /// Leftmost first match semantics corresponds to the match with the + /// smallest starting offset, but where the end offset is determined by + /// preferring earlier branches in the original regular expression. 
For + /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` + /// will match `Samwise` in `Samwise`. + /// + /// Generally speaking, the "leftmost first" match is how most backtracking + /// regular expressions tend to work. This is in contrast to POSIX-style + /// regular expressions that yield "leftmost longest" matches. Namely, + /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using + /// leftmost longest semantics. (This crate does not currently support + /// leftmost longest semantics.) + /// + /// ``` + /// use regex_automata::{dfa::onepass::DFA, Match}; + /// + /// let re = DFA::new("foo[0-9]+")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..8); + /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); + /// + /// // Even though a match is found after reading the first byte (`a`), + /// // the leftmost first match semantics demand that we find the earliest + /// // match that prefers earlier parts of the pattern over later parts. 
+ /// let re = DFA::new("abc|a")?; + /// let mut cache = re.create_cache(); + /// let expected = Match::must(0, 0..3); + /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn find<'h, I: Into>>( + &self, + cache: &mut Cache, + input: I, + ) -> Option { + let mut input = input.into(); + if matches!(input.get_anchored(), Anchored::No) { + input.set_anchored(Anchored::Yes); + } + if self.get_nfa().pattern_len() == 1 { + let mut slots = [None, None]; + let pid = + self.try_search_slots(cache, &input, &mut slots).unwrap()?; + let start = slots[0].unwrap().get(); + let end = slots[1].unwrap().get(); + return Some(Match::new(pid, Span { start, end })); + } + let ginfo = self.get_nfa().group_info(); + let slots_len = ginfo.implicit_slot_len(); + let mut slots = vec![None; slots_len]; + let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?; + let start = slots[pid.as_usize() * 2].unwrap().get(); + let end = slots[pid.as_usize() * 2 + 1].unwrap().get(); + Some(Match::new(pid, Span { start, end })) + } + /// Executes an anchored leftmost forward search and writes the spans /// of capturing groups that participated in a match into the provided /// [`Captures`] value. If no match was found, then [`Captures::is_match`] @@ -2031,10 +2116,7 @@ impl DFA { let mut pid = None; let mut next_sid = match input.get_anchored() { Anchored::Yes => self.start(), - Anchored::Pattern(pid) => match self.start_pattern(pid)? { - None => return Ok(None), - Some(sid) => sid, - }, + Anchored::Pattern(pid) => self.start_pattern(pid)?, Anchored::No => { // If the regex is itself always anchored, then we're fine, // even if the search is configured to be unanchored. @@ -2153,10 +2235,7 @@ impl DFA { /// 'starts_for_each_pattern' /// was not enabled, then this returns an error. If the given pattern is /// not in this DFA, then `Ok(None)` is returned. 
- fn start_pattern( - &self, - pid: PatternID, - ) -> Result, MatchError> { + fn start_pattern(&self, pid: PatternID) -> Result { if !self.config.get_starts_for_each_pattern() { return Err(MatchError::unsupported_anchored(Anchored::Pattern( pid, @@ -2168,7 +2247,7 @@ impl DFA { // patterns at pid+1. Thus, starts.len()-1 corresponds to the total // number of patterns that one can explicitly search for. (And it may // be zero.) - Ok(self.starts.get(pid.one_more()).copied()) + Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD)) } /// Returns the transition from the given state ID and byte of input. The diff --git a/regex-automata/src/dfa/search.rs b/regex-automata/src/dfa/search.rs index c975921c9f..8c012a5944 100644 --- a/regex-automata/src/dfa/search.rs +++ b/regex-automata/src/dfa/search.rs @@ -51,10 +51,7 @@ fn find_fwd_imp( // See 'prefilter_restart' docs for explanation. let universal_start = dfa.universal_start_state(Anchored::No).is_some(); let mut mat = None; - let mut sid = match init_fwd(dfa, input)? { - None => return Ok(None), - Some(sid) => sid, - }; + let mut sid = init_fwd(dfa, input)?; let mut at = input.start(); // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety @@ -211,10 +208,7 @@ fn find_rev_imp( earliest: bool, ) -> Result, MatchError> { let mut mat = None; - let mut sid = match init_rev(dfa, input)? { - None => return Ok(None), - Some(sid) => sid, - }; + let mut sid = init_rev(dfa, input)?; // In reverse search, the loop below can't handle the case of searching an // empty slice. Ideally we could write something congruent to the forward // search, i.e., 'while at >= start', but 'start' might be 0. Since we use @@ -350,10 +344,7 @@ fn find_overlapping_fwd_imp( let mut sid = match state.id { None => { state.at = input.start(); - match init_fwd(dfa, input)? { - None => return Ok(()), - Some(sid) => sid, - } + init_fwd(dfa, input)? 
} Some(sid) => { if let Some(match_index) = state.next_match_index { @@ -402,9 +393,12 @@ fn find_overlapping_fwd_imp( } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); - state.at = - accel::find_fwd(needles, input.haystack(), state.at) - .unwrap_or(input.end()); + state.at = accel::find_fwd( + needles, + input.haystack(), + state.at + 1, + ) + .unwrap_or(input.end()); continue; } } else if dfa.is_match_state(sid) { @@ -421,8 +415,9 @@ fn find_overlapping_fwd_imp( // byte values. However, there might be an EOI transition. So // we set 'at' to the end of the haystack, which will cause // this loop to stop and fall down into the EOI transition. - state.at = accel::find_fwd(needs, input.haystack(), state.at) - .unwrap_or(input.end()); + state.at = + accel::find_fwd(needs, input.haystack(), state.at + 1) + .unwrap_or(input.end()); continue; } else if dfa.is_dead_state(sid) { return Ok(()); @@ -461,10 +456,7 @@ pub(crate) fn find_overlapping_rev( } let mut sid = match state.id { None => { - let sid = match init_rev(dfa, input)? { - None => return Ok(()), - Some(sid) => sid, - }; + let sid = init_rev(dfa, input)?; state.id = Some(sid); if input.start() == input.end() { state.rev_eoi = true; @@ -564,30 +556,24 @@ pub(crate) fn find_overlapping_rev( fn init_fwd( dfa: &A, input: &Input<'_>, -) -> Result, MatchError> { - let sid = match dfa.start_state_forward(input)? { - None => return Ok(None), - Some(sid) => sid, - }; +) -> Result { + let sid = dfa.start_state_forward(input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. debug_assert!(!dfa.is_match_state(sid)); - Ok(Some(sid)) + Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] fn init_rev( dfa: &A, input: &Input<'_>, -) -> Result, MatchError> { - let sid = match dfa.start_state_reverse(input)? 
{ - None => return Ok(None), - Some(sid) => sid, - }; +) -> Result { + let sid = dfa.start_state_reverse(input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. debug_assert!(!dfa.is_match_state(sid)); - Ok(Some(sid)) + Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -664,5 +650,5 @@ fn prefilter_restart( ) -> Result { let mut input = input.clone(); input.set_start(at); - init_fwd(dfa, &input).map(|start| start.unwrap()) + init_fwd(dfa, &input) } diff --git a/regex-automata/src/dfa/sparse.rs b/regex-automata/src/dfa/sparse.rs index bd6ee38578..5d8ec23408 100644 --- a/regex-automata/src/dfa/sparse.rs +++ b/regex-automata/src/dfa/sparse.rs @@ -1210,7 +1210,7 @@ unsafe impl> Automaton for DFA { fn start_state_forward( &self, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { if !self.quitset.is_empty() && input.start() > 0 { let offset = input.start() - 1; let byte = input.haystack()[offset]; @@ -1226,7 +1226,7 @@ unsafe impl> Automaton for DFA { fn start_state_reverse( &self, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { if !self.quitset.is_empty() && input.end() < input.haystack().len() { let offset = input.end(); let byte = input.haystack()[offset]; @@ -2147,7 +2147,7 @@ impl> StartTable { &self, input: &Input<'_>, start: Start, - ) -> Result, MatchError> { + ) -> Result { let start_index = start.as_usize(); let mode = input.get_anchored(); let index = match mode { @@ -2171,7 +2171,7 @@ impl> StartTable { Some(len) => len, }; if pid.as_usize() >= len { - return Ok(None); + return Ok(DEAD); } (2 * self.stride) + (self.stride * pid.as_usize()) @@ -2181,7 +2181,7 @@ impl> StartTable { let start = index * StateID::SIZE; // This OK since we're allowed to assume that the start table contains // valid StateIDs. 
- Ok(Some(wire::read_state_id_unchecked(&self.table()[start..]).0)) + Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) } /// Return an iterator over all start IDs in this table. diff --git a/regex-automata/src/hybrid/dfa.rs b/regex-automata/src/hybrid/dfa.rs index 246e9f55fd..874b511e23 100644 --- a/regex-automata/src/hybrid/dfa.rs +++ b/regex-automata/src/hybrid/dfa.rs @@ -1197,12 +1197,9 @@ impl DFA { /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// // - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), - /// )?.unwrap(); + /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// sid = dfa.next_state(&mut cache, sid, b)?; @@ -1300,12 +1297,9 @@ impl DFA { /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// // - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), - /// )?.unwrap(); + /// )?; /// // Walk all the bytes in the haystack. /// let mut at = 0; /// while at < haystack.len() { @@ -1492,12 +1486,9 @@ impl DFA { /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// // - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), - /// )?.unwrap(); + /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// sid = dfa.next_state(&mut cache, sid, b)?; @@ -1528,8 +1519,7 @@ impl DFA { } /// Return the ID of the start state for this lazy DFA when executing a - /// forward search. 
If a match is known to be impossible while computing - /// the start state, then `None` is returned. + /// forward search. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -1547,15 +1537,16 @@ impl DFA { /// # Errors /// /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if it - /// sees a "quit" byte). This can also return an error if the given `Input` - /// contains an unsupported [`Anchored`] configuration. + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { if !self.quitset.is_empty() && input.start() > 0 { let offset = input.start() - 1; let byte = input.haystack()[offset]; @@ -1566,19 +1557,14 @@ impl DFA { let start_type = self.start_map.fwd(input); let start = LazyRef::new(self, cache) .get_cached_start_id(input, start_type)?; - let sid = match start { - None => return Ok(None), - Some(sid) => sid, - }; - if !sid.is_unknown() { - return Ok(Some(sid)); + if !start.is_unknown() { + return Ok(start); } Lazy::new(self, cache).cache_start_group(input, start_type) } /// Return the ID of the start state for this lazy DFA when executing a - /// reverse search. If a match is known to be impossible while computing - /// the start state, then `None` is returned. + /// reverse search. 
/// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: @@ -1596,15 +1582,16 @@ impl DFA { /// # Errors /// /// This may return a [`MatchError`] (not a [`CacheError`]!) if the search - /// needs to give up when determining the start state (for example, if it - /// sees a "quit" byte). This can also return an error if the given `Input` - /// contains an unsupported [`Anchored`] configuration. + /// needs to give up when determining the start state (for example, if + /// it sees a "quit" byte or if the cache has been cleared too many + /// times). This can also return an error if the given `Input` contains an + /// unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, - ) -> Result, MatchError> { + ) -> Result { if !self.quitset.is_empty() && input.end() < input.haystack().len() { let offset = input.end(); let byte = input.haystack()[offset]; @@ -1615,12 +1602,8 @@ impl DFA { let start_type = self.start_map.rev(input); let start = LazyRef::new(self, cache) .get_cached_start_id(input, start_type)?; - let sid = match start { - None => return Ok(None), - Some(sid) => sid, - }; - if !sid.is_unknown() { - return Ok(Some(sid)); + if !start.is_unknown() { + return Ok(start); } Lazy::new(self, cache).cache_start_group(input, start_type) } @@ -1673,12 +1656,9 @@ impl DFA { /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. - /// // - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), - /// )?.unwrap(); + /// )?; /// // Walk all the bytes in the haystack. 
/// for &b in haystack { /// sid = dfa.next_state(&mut cache, sid, b)?; @@ -1954,7 +1934,7 @@ impl Cache { /// This panics if no search has been started by [`Cache::search_start`]. #[inline] pub fn search_update(&mut self, at: usize) { - let mut p = + let p = self.progress.as_mut().expect("no in-progress search to update"); p.at = at; } @@ -2144,7 +2124,7 @@ impl<'i, 'c> Lazy<'i, 'c> { &mut self, input: &Input<'_>, start: Start, - ) -> Result, MatchError> { + ) -> Result { let mode = input.get_anchored(); let nfa_start_id = match mode { Anchored::No => self.dfa.get_nfa().start_unanchored(), @@ -2154,7 +2134,7 @@ impl<'i, 'c> Lazy<'i, 'c> { return Err(MatchError::unsupported_anchored(mode)); } match self.dfa.get_nfa().start_pattern(pid) { - None => return Ok(None), + None => return Ok(self.as_ref().dead_id()), Some(sid) => sid, } } @@ -2164,7 +2144,7 @@ impl<'i, 'c> Lazy<'i, 'c> { .cache_start_one(nfa_start_id, start) .map_err(|_| MatchError::gave_up(input.start()))?; self.set_start_state(input, start, id); - Ok(Some(id)) + Ok(id) } /// Compute and cache the starting state for the given NFA state ID and the @@ -2664,7 +2644,7 @@ impl<'i, 'c> LazyRef<'i, 'c> { &self, input: &Input<'_>, start: Start, - ) -> Result, MatchError> { + ) -> Result { let start_index = start.as_usize(); let mode = input.get_anchored(); let index = match mode { @@ -2675,14 +2655,14 @@ impl<'i, 'c> LazyRef<'i, 'c> { return Err(MatchError::unsupported_anchored(mode)); } if pid.as_usize() >= self.dfa.pattern_len() { - return Ok(None); + return Ok(self.dead_id()); } (2 * Start::len()) + (Start::len() * pid.as_usize()) + start_index } }; - Ok(Some(self.cache.starts[index])) + Ok(self.cache.starts[index]) } /// Return the cached NFA/DFA powerset state for the given ID. @@ -3401,11 +3381,7 @@ impl Config { /// let mut cache = dfa.create_cache(); /// /// let haystack = "123 foobar 4567".as_bytes(); - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. 
- /// let sid = dfa.start_state_forward( - /// &mut cache, &Input::new(haystack), - /// ).map_err(|_| MatchError::gave_up(0))?.unwrap(); + /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; /// // The ID returned by 'start_state_forward' will always be tagged as /// // a start state when start state specialization is enabled. /// assert!(sid.is_tagged()); @@ -3425,11 +3401,7 @@ impl Config { /// let mut cache = dfa.create_cache(); /// /// let haystack = "123 foobar 4567".as_bytes(); - /// // The unwrap is OK because we aren't requesting a start state for a - /// // specific pattern. - /// let sid = dfa.start_state_forward( - /// &mut cache, &Input::new(haystack), - /// ).map_err(|_| MatchError::gave_up(0))?.unwrap(); + /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; /// // Start states are not tagged in the default configuration! /// assert!(!sid.is_tagged()); /// assert!(!sid.is_start()); diff --git a/regex-automata/src/hybrid/id.rs b/regex-automata/src/hybrid/id.rs index f564f89a65..662e3c98f0 100644 --- a/regex-automata/src/hybrid/id.rs +++ b/regex-automata/src/hybrid/id.rs @@ -71,13 +71,10 @@ /// // initial bytes of the haystack. Note that start states can never /// // be match states (since DFAs in this crate delay matches by 1 /// // byte), so we don't need to check if the start state is a match. -/// // -/// // The unwrap is OK because we aren't requesting a start state for a -/// // specific pattern. /// let mut sid = dfa.start_state_forward( /// cache, /// &Input::new(haystack), -/// ).map_err(|_| MatchError::gave_up(0))?.unwrap(); +/// )?; /// let mut last_match = None; /// // Walk all the bytes in the haystack. We can quit early if we see /// // a dead or a quit state. 
The former means the automaton will diff --git a/regex-automata/src/hybrid/search.rs b/regex-automata/src/hybrid/search.rs index aad5c25c37..0d0bb8af5b 100644 --- a/regex-automata/src/hybrid/search.rs +++ b/regex-automata/src/hybrid/search.rs @@ -57,10 +57,7 @@ fn find_fwd_imp( // See 'prefilter_restart' docs for explanation. let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); let mut mat = None; - let mut sid = match init_fwd(dfa, cache, input)? { - None => return Ok(None), - Some(sid) => sid, - }; + let mut sid = init_fwd(dfa, cache, input)?; let mut at = input.start(); // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety @@ -319,10 +316,7 @@ fn find_rev_imp( earliest: bool, ) -> Result, MatchError> { let mut mat = None; - let mut sid = match init_rev(dfa, cache, input)? { - None => return Ok(None), - Some(sid) => sid, - }; + let mut sid = init_rev(dfa, cache, input)?; // In reverse search, the loop below can't handle the case of searching an // empty slice. Ideally we could write something congruent to the forward // search, i.e., 'while at >= start', but 'start' might be 0. Since we use @@ -481,10 +475,7 @@ fn find_overlapping_fwd_imp( let mut sid = match state.id { None => { state.at = input.start(); - match init_fwd(dfa, cache, input)? { - None => return Ok(()), - Some(sid) => sid, - } + init_fwd(dfa, cache, input)? } Some(sid) => { if let Some(match_index) = state.next_match_index { @@ -585,10 +576,7 @@ pub(crate) fn find_overlapping_rev( } let mut sid = match state.id { None => { - let sid = match init_rev(dfa, cache, input)? 
{ - None => return Ok(()), - Some(sid) => sid, - }; + let sid = init_rev(dfa, cache, input)?; state.id = Some(sid); if input.start() == input.end() { state.rev_eoi = true; @@ -680,15 +668,12 @@ fn init_fwd( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, -) -> Result, MatchError> { - let sid = match dfa.start_state_forward(cache, input)? { - None => return Ok(None), - Some(sid) => sid, - }; +) -> Result { + let sid = dfa.start_state_forward(cache, input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. debug_assert!(!sid.is_match()); - Ok(Some(sid)) + Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -696,15 +681,12 @@ fn init_rev( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, -) -> Result, MatchError> { - let sid = match dfa.start_state_reverse(cache, input)? { - None => return Ok(None), - Some(sid) => sid, - }; +) -> Result { + let sid = dfa.start_state_reverse(cache, input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. debug_assert!(!sid.is_match()); - Ok(Some(sid)) + Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] @@ -810,13 +792,7 @@ fn prefilter_restart( ) -> Result { let mut input = input.clone(); input.set_start(at); - // We can unwrap the inner state ID because restarting a prefilter comes - // after the initial computation of the start state, which we know already - // succeeded by virtue of running the prefilter. Thus, we know it will - // succeed to find a start state again. (The only way it can't is if - // the pattern ID doesn't exist in this DFA, but that configuration is - // invariant throughout the lifetime of a search.) - init_fwd(dfa, cache, &input).map(|start| start.unwrap()) + init_fwd(dfa, cache, &input) } /// A convenience routine for constructing a "gave up" match error. 
diff --git a/regex-automata/src/lib.rs b/regex-automata/src/lib.rs index fd869816ea..85fa27d736 100644 --- a/regex-automata/src/lib.rs +++ b/regex-automata/src/lib.rs @@ -1,14 +1,540 @@ /*! -This crate provides an "expert" API for executing regular expressions using -finite automata. - -**WARNING**: This `0.2` release of `regex-automata` was published -before it was ready to unblock work elsewhere that needed some -of the new APIs in this release. At the time of writing, it is -strongly preferred that you continue using the -[`regex-automata 0.1`](https://docs.rs/regex-automata/0.1/regex_automata/) -release. Since this release represents an unfinished state, please do not -create issues for this release unless it's for a critical bug. +This crate exposes a variety of regex engines used by the `regex` crate. +It provides a vast, sprawling and "expert" level API to each regex engine. +The regex engines provided by this crate focus heavily on finite automata +implementations and specifically guarantee worst case `O(m * n)` time +complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.) + +The primary goal of this crate is to serve as an implementation detail for the +`regex` crate. A secondary goal is to make its internals available for use by +others. + +# Table of contents + +* [Should I be using this crate?](#should-i-be-using-this-crate) gives some +reasons for and against using this crate. +* [Examples](#examples) provides a small selection of things you can do with +this crate. +* [Available regex engines](#available-regex-engines) provides a hyperlinked +list of all regex engines in this crate. +* [API themes](#api-themes) discusses common elements used throughout this +crate. +* [Crate features](#crate-features) documents the extensive list of Cargo +features available. + +# Should I be using this crate? 
+ +If you find yourself here because you just want to use regexes, then you should +first check out whether the [`regex` crate](https://docs.rs/regex) meets +your needs. It provides a streamlined and difficult-to-misuse API for regex +searching. + +If you're here because there is something specific you want to do that can't +be easily done with the `regex` crate, then you are perhaps in the right place. +It's most likely that the first stop you'll want to make is to explore the +[`meta` regex APIs](meta). Namely, the `regex` crate is just a light wrapper +over a [`meta::Regex`], so its API will probably be the easiest to transition +to. In contrast to the `regex` crate, the `meta::Regex` API supports more +search parameters and does multi-pattern searches. However, it isn't quite as +ergonomic. + +Otherwise, the following is an inexhaustive list of reasons to use this crate: + +* You want to analyze or use a [Thompson `NFA`](nfa::thompson::NFA) directly. +* You want more powerful multi-pattern search than what is provided by +`RegexSet` in the `regex` crate. All regex engines in this crate support +multi-pattern searches. +* You want to use one of the `regex` crate's internal engines directly because +of some interesting configuration that isn't possible via the `regex` crate. +For example, a [lazy DFA's configuration](hybrid::dfa::Config) exposes a +dizzying number of options for controlling its execution. +* You want to use the lower level search APIs. For example, both the [lazy +DFA](hybrid::dfa) and [fully compiled DFAs](dfa) support searching by exploring +the automaton one state at a time. This might be useful, for example, for +stream searches or searches of strings stored in non-contiguous memory. +* You want to build a fully compiled DFA and then [use zero-copy +deserialization](dfa::dense::DFA::from_bytes) to load it into memory and use +it for searching. This use case is supported in core-only no-std/no-alloc +environments.
+* You want to run [anchored searches](Input::anchored) without using the `^` +anchor in your regex pattern. +* You need to work around contention issues with +sharing a regex across multiple threads. The +[`meta::Regex::search_with`](meta::Regex::search_with) API permits bypassing +any kind of synchronization at all by requiring the caller to provide the +mutable scratch space needed during a search. +* You want to build your own regex engine on top of the `regex` crate's +infrastructure. + +# Examples + +This section tries to identify a few interesting things you can do with this +crate and demonstrates them. + +### Multi-pattern searches with capture groups + +One of the more frustrating limitations of `RegexSet` in the `regex` crate +(at the time of writing) is that it doesn't report match positions. With this +crate, multi-pattern support was intentionally designed in from the beginning, +which means it works in all regex engines and even for capture groups as well. + +This example shows how to search for matches of multiple regexes, where each +regex uses the same capture group names to parse different key-value formats. + +``` +use regex_automata::{meta::Regex, PatternID}; + +let re = Regex::new_many(&[ + r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#, + r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#, + r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#, + r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#, +])?; +let hay = r#" +best_album="Blow Your Face Out" +best_quote='"then as it was, then again it will be"' +best_year=1973 +best_simpsons_episode: HOMR +"#; +let mut kvs = vec![]; +for caps in re.captures_iter(hay) { + // N.B. One could use capture indices '1' and '2' here + // as well. Capture indices are local to each pattern. + // (Just like names are.)
+ let key = &hay[caps.get_group_by_name("key").unwrap()];
+ let val = &hay[caps.get_group_by_name("val").unwrap()];
+ kvs.push((key, val));
+}
+assert_eq!(kvs, vec![
+ ("best_album", "Blow Your Face Out"),
+ ("best_quote", "\"then as it was, then again it will be\""),
+ ("best_year", "1973"),
+ ("best_simpsons_episode", "HOMR"),
+]);
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+### Build a full DFA and walk it manually
+
+One of the regex engines in this crate is a fully compiled DFA. It takes worst
+case exponential time to build, but once built, it can be easily explored and
+used for searches. Here's a simple example that uses its lower level APIs to
+implement a simple anchored search by hand.
+
+```
+use regex_automata::{dfa::{Automaton, dense}, Input};
+
+let dfa = dense::DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
+let haystack = "Quartz";
+
+// The start state is determined by inspecting the position and the
+// initial bytes of the haystack.
+let mut state = dfa.start_state_forward(&Input::new(haystack))?;
+// Walk all the bytes in the haystack.
+for &b in haystack.as_bytes().iter() {
+ state = dfa.next_state(state, b);
+}
+// DFAs in this crate require an explicit
+// end-of-input transition if a search reaches
+// the end of a haystack.
+state = dfa.next_eoi_state(state);
+assert!(dfa.is_match_state(state));
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+Or do the same with a lazy DFA that avoids exponential worst case compile time,
+but requires mutable scratch space to lazily build the DFA during the search.
+
+```
+use regex_automata::{hybrid::dfa::DFA, Input};
+
+let dfa = DFA::new(r"(?-u)\b[A-Z]\w+z\b")?;
+let mut cache = dfa.create_cache();
+let hay = "Quartz";
+
+// The start state is determined by inspecting the position and the
+// initial bytes of the haystack.
+let mut state = dfa.start_state_forward(&mut cache, &Input::new(hay))?;
+// Walk all the bytes in the haystack.
+for &b in hay.as_bytes().iter() {
+ state = dfa.next_state(&mut cache, state, b)?;
+}
+// DFAs in this crate require an explicit
+// end-of-input transition if a search reaches
+// the end of a haystack.
+state = dfa.next_eoi_state(&mut cache, state)?;
+assert!(state.is_match());
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+### Find all overlapping matches
+
+This example shows how to build a DFA and use it to find all possible matches,
+including overlapping matches. A similar example will work with a lazy DFA as
+well. This also works with multiple patterns and will report all matches at the
+same position where multiple patterns match.
+
+```
+use regex_automata::{
+ dfa::{dense, Automaton, OverlappingState},
+ Input, MatchKind,
+};
+
+let dfa = dense::DFA::builder()
+ .configure(dense::DFA::config().match_kind(MatchKind::All))
+ .build(r"(?-u)\w{3,}")?;
+let input = Input::new("homer marge bart lisa maggie");
+let mut state = OverlappingState::start();
+
+let mut matches = vec![];
+while let Some(hm) = {
+ dfa.try_search_overlapping_fwd(&input, &mut state)?;
+ state.get_match()
+} {
+ matches.push(hm.offset());
+}
+assert_eq!(matches, vec![
+ 3, 4, 5, // hom, home, homer
+ 9, 10, 11, // mar, marg, marge
+ 15, 16, // bar, bart
+ 20, 21, // lis, lisa
+ 25, 26, 27, 28, // mag, magg, maggi, maggie
+]);
+
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Available regex engines
+
+The following is a complete list of all regex engines provided by this crate,
+along with a very brief description of it and why you might want to use it.
+
+* [`dfa::regex::Regex`] is a regex engine that works on top of either
+[dense](dfa::dense) or [sparse](dfa::sparse) fully compiled DFAs. You might
+use a DFA if you need the fastest possible regex engine in this crate and can
+afford the exorbitant memory usage usually required by DFAs. Low level APIs on
+fully compiled DFAs are provided by the [`Automaton` trait](dfa::Automaton).
+Fully compiled dense DFAs can handle all regexes except for searching a regex +with a Unicode word boundary on non-ASCII haystacks. A fully compiled DFA based +regex can only report the start and end of each match. +* [`hybrid::regex::Regex`] is a regex engine that works on top of a lazily +built DFA. Its performance profile is very similar to that of fully compiled +DFAs, but can be slower in some pathological cases. Fully compiled DFAs are +also amenable to more optimizations, such as state acceleration, that aren't +available in a lazy DFA. You might use this lazy DFA if you can't abide the +worst case exponential compile time of a full DFA, but still want the DFA +search performance in the vast majority of cases. A lazy DFA based regex can +only report the start and end of each match. +* [`dfa::onepass::DFA`] is a regex engine that is implemented as a DFA, but +can report the matches of each capture group in addition to the start and end +of each match. The catch is that it only works on a somewhat small subset of +regexes known as "one-pass." You'll want to use this for cases when you need +capture group matches and the regex is one-pass since it is likely to be faster +than any alternative. A one-pass DFA can handle all types of regexes, but does +have some reasonable limits on the number of capture groups it can handle. +* [`nfa::thompson::backtrack::BoundedBacktracker`] is a regex engine that uses +backtracking, but keeps track of the work it has done to avoid catastrophic +backtracking. Like the one-pass DFA, it provides the matches of each capture +group. It retains the `O(m * n)` worst case time bound. This tends to be slower +than the one-pass DFA regex engine, but faster than the PikeVM. It can handle +all types of regexes, but usually only works well with small haystacks and +small regexes due to the memory required to avoid redoing work. 
+* [`nfa::thompson::pikevm::PikeVM`] is a regex engine that can handle all
+regexes, of all sizes and provides capture group matches. It tends to be a tool
+of last resort because it is also usually the slowest regex engine.
+* [`meta::Regex`] is the meta regex engine that combines *all* of the above
+engines into one. The reason for this is that each of the engines above have
+their own caveats such as, "only handles a subset of regexes" or "is generally
+slow." The meta regex engine accounts for all of these caveats and composes
+the engines in a way that attempts to mitigate each engine's weaknesses while
+emphasizing its strengths. For example, it will attempt to run a lazy DFA even
+if it might fail. In which case, it will restart the search with a likely
+slower but more capable regex engine. The meta regex engine is what you should
+default to. Use one of the above engines directly only if you have a specific
+reason to.
+
+# API themes
+
+While each regex engine has its own APIs and configuration options, there are
+some general themes followed by all of them.
+
+### The `Input` abstraction
+
+Most search routines in this crate accept anything that implements
+`Into<Input>`. Both `&str` and `&[u8]` haystacks satisfy this constraint, which
+means that things like `engine.search("foo")` will work as you would expect.
+
+By virtue of accepting an `Into<Input>` though, callers can provide more than
+just a haystack. Indeed, the [`Input`] type has more details, but briefly,
+callers can use it to configure various aspects of the search:
+
+* The span of the haystack to search via [`Input::span`] or [`Input::range`],
+which might be a substring of the haystack.
+* Whether to run an anchored search or not via [`Input::anchored`]. This
+permits one to require matches to start at the same offset that the search
+started.
+* Whether to ask the regex engine to stop as soon as a match is seen via
+[`Input::earliest`].
This can be used to find the offset of a match as soon +as it is known without waiting for the full leftmost-first match to be found. +This can also be used to avoid the worst case `O(m * n^2)` time complexity +of iteration. + +Some lower level search routines accept an `&Input` for performance reasons. +In which case, `&Input::new("haystack")` can be used for a simple search. + +### Error reporting + +Most, but not all, regex engines in this crate can fail to execute a search. +When a search fails, callers cannot determine whether or not a match exists. +That is, the result is indeterminate. + +Search failure, in all cases in this crate, is represented by a [`MatchError`]. +Routines that can fail start with the `try_` prefix in their name. For example, +[`hybrid::regex::Regex::try_search`] can fail for a number of reasons. +Conversely, routines that either can't fail or can panic on failure lack the +`try_` prefix. For example, [`hybrid::regex::Regex::find`] will panic in +cases where [`hybrid::regex::Regex::try_search`] would return an error, and +[`meta::Regex::find`] will never panic. Therefore, callers need to pay close +attention to the panicking conditions in the documentation. + +In most cases, the reasons that a search fails are either predictable or +configurable, albeit at some additional cost. + +An example of predictable failure is +[`BoundedBacktracker::try_search`](nfa::thompson::backtrack::BoundedBacktracker::try_search). +Namely, it fails whenever the multiplication of the haystack, the regex and some +constant exceeds the +[configured visited capacity](nfa::thompson::backtrack::Config::visited_capacity). +Callers can predict the failure in terms of haystack length via the +[`BoundedBacktracker::max_haystack_len`](nfa::thompson::backtrack::BoundedBacktracker::max_haystack_len) +method. 
While this form of failure is technically avoidable by increasing the
+visited capacity, it isn't practical to do so for all inputs because the
+memory usage required for larger haystacks becomes impractically large. So in
+practice, if one is using the bounded backtracker, you really do have to deal
+with the failure.
+
+An example of configurable failure happens when one enables heuristic support
+for Unicode word boundaries in a DFA. Namely, since the DFAs in this crate
+(except for the one-pass DFA) do not support Unicode word boundaries on
+non-ASCII haystacks, building a DFA from an NFA that contains a Unicode word
+boundary will itself fail. However, one can configure DFAs to still be built in
+this case by
+[configuring heuristic support for Unicode word boundaries](hybrid::dfa::Config::unicode_word_boundary).
+If the NFA the DFA is built from contains a Unicode word boundary, then the
+DFA will still be built, but special transitions will be added to every state
+that cause the DFA to fail if any non-ASCII byte is seen. This failure happens
+at search time and it requires the caller to opt into this.
+
+There are other ways for regex engines to fail in this crate, but the above
+two should represent the general theme of failures one can find. Dealing
+with these failures is, in part, one of the responsibilities of the [meta regex
+engine](meta). Notice, for example, that the meta regex engine exposes an API
+that never returns an error nor panics. It carefully manages all of the ways
+in which the regex engines can fail and either avoids the predictable ones
+entirely (e.g., the bounded backtracker) or reacts to configured failures by
+falling back to a different engine (e.g., the lazy DFA quitting because it saw
+a non-ASCII byte).
+
+### Configuration and Builders
+
+Most of the regex engines in this crate come with two types to facilitate
+building the regex engine: a `Config` and a `Builder`.
A `Config` is usually +specific to that particular regex engine, but other objects such as parsing and +NFA compilation have `Config` types too. A `Builder` is the thing responsible +for taking inputs (either pattern strings or already-parsed patterns or even +NFAs directly) and turning them into an actual regex engine that can be used +for searching. + +The main reason why building a regex engine is a bit complicated is because +of the desire to permit composition with de-coupled components. For example, +you might want to [manually construct a Thompson NFA](nfa::thompson::Builder) +and then build a regex engine from it without ever using a regex parser +at all. On the other hand, you might also want to build a regex engine directly +from the concrete syntax. This demonstrates why regex engine construction is +so flexible: it needs to support not just convenient construction, but also +construction from parts built elsewhere. + +This is also in turn why there are many different `Config` structs in this +crate. Let's look more closely at an example: [`hybrid::regex::Builder`]. It +accepts three different `Config` types for configuring construction of a lazy +DFA regex: + +* [`hybrid::regex::Builder::syntax`] accepts a +[`util::syntax::Config`] for configuring the options found in the +[`regex-syntax`](regex_syntax) crate. For example, whether to match +case insensitively. +* [`hybrid::regex::Builder::thompson`] accepts a [`nfa::thompson::Config`] for +configuring construction of a [Thompson NFA](nfa::thompson::NFA). For example, +whether to build an NFA that matches the reverse language described by the +regex. +* [`hybrid::regex::Builder::dfa`] accept a [`hybrid::dfa::Config`] for +configuring construction of the pair of underlying lazy DFAs that make up the +lazy DFA regex engine. For example, changing the capacity of the cache used to +store the transition table. 
+ +The lazy DFA regex engine uses all three of those configuration objects for +methods like [`hybrid::regex::Builder::build`], which accepts a pattern +string containing the concrete syntax of your regex. It uses the syntax +configuration to parse it into an AST and translate it into an HIR. Then the +NFA configuration when compiling the HIR into an NFA. And then finally the DFA +configuration when lazily determinizing the NFA into a DFA. + +Notice though that the builder also has a +[`hybrid::regex::Builder::build_from_dfas`] constructor. This permits callers +to build the underlying pair of lazy DFAs themselves (one for the forward +searching to find the end of a match and one for the reverse searching to find +the start of a match), and then build the regex engine from them. The lazy +DFAs, in turn, have their own builder that permits [construction directly from +a Thompson NFA](hybrid::dfa::Builder::build_from_nfa). Continuing down the +rabbit hole, a Thompson NFA has its own compiler that permits [construction +directly from an HIR](nfa::thompson::Compiler::build_from_hir). The lazy DFA +regex engine builder lets you follow this rabbit hole all the way down, but +also provides convenience routines that do it for you when you don't need +precise control over every component. + +The [meta regex engine](meta) is a good example of something that utilizes the +full flexibility of these builders. It often needs not only precise control +over each component, but also shares them across multiple regex engines. +(Most sharing is done by internal reference accounting. For example, an +[`NFA`](nfa::thompson::NFA) is reference counted internally which makes cloning +cheap.) + +### Size limits + +Unlike the `regex` crate, the `regex-automata` crate specifically does not +enable any size limits by default. That means users of this crate need to +be quite careful when using untrusted patterns. 
Namely, because bounded +repetitions can grow exponentially by stacking them, it is possible to build a +very large internal regex object from just a small pattern string. For example, +the NFA built from the pattern `a{10}{10}{10}{10}{10}{10}{10}` is over 240MB. + +There are multiple size limit options in this crate. If one or more size limits +are relevant for the object you're building, they will be configurable via +methods on a corresponding `Config` type. + +# Crate features + +This crate has a dizzying number of features. The main idea is to be able to +control how much stuff you pull in for your specific use case, since the full +crate is quite large and can dramatically increase compile times and binary +size. + +The most barebones but useful configuration is to disable all default features +and enable only `dfa-search`. This will bring in just the DFA deserialization +and search routines without any dependency on `std` or `alloc`. This does +require generating and serializing a DFA, and then storing it somewhere, but +it permits regex searches in freestanding or embedded environments. + +Because there are so many features, they are split into a few groups. + +The default set of features is: `std`, `syntax`, `perf`, `unicode`, `meta`, +`nfa`, `dfa` and `hybrid`. Basically, the default is to enable everything +except for development related features like `logging`. + +### Ecosystem features + +* **std** - Enables use of the standard library. In terms of APIs, this usually +just means that error types implement the `std::error::Error` trait. Otherwise, +`std` sometimes enables the code to be faster, for example, using a `HashMap` +instead of a `BTreeMap`. (The `std` feature matters more for dependencies like +`aho-corasick` and `memchr`, where `std` is required to enable certain classes +of SIMD optimizations.) Enabling `std` automatically enables `alloc`. +* **alloc** - Enables use of the `alloc` library. This is required for most +APIs in this crate. 
The main exception is deserializing and searching with +fully compiled DFAs. +* **logging** - Adds a dependency on the `log` crate and makes this crate emit +log messages of varying degrees of utility. The log messages are especially +useful in trying to understand what the meta regex engine is doing. + +### Performance features + +* **perf** - Enables all of the below features. +* **perf-inline** - When enabled, `inline(always)` is used in (many) strategic +locations to help performance at the expense of longer compile times and +increased binary size. +* **perf-literal** - Enables all literal related optimizations. + * **perf-literal-substring** - Enables all single substring literal + optimizations. This includes adding a dependency on the `memchr` crate. + * **perf-literal-multisubstring** - Enables all multiple substring literal + optimizations. This includes adding a dependency on the `aho-corasick` + crate. + +### Unicode features + +* **unicode** - + Enables all Unicode features. This feature is enabled by default, and will + always cover all Unicode features, even if more are added in the future. +* **unicode-age** - + Provide the data for the + [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). + This makes it possible to use classes like `\p{Age:6.0}` to refer to all + codepoints first introduced in Unicode 6.0 +* **unicode-bool** - + Provide the data for numerous Unicode boolean properties. The full list + is not included here, but contains properties like `Alphabetic`, `Emoji`, + `Lowercase`, `Math`, `Uppercase` and `White_Space`. +* **unicode-case** - + Provide the data for case insensitive matching using + [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). +* **unicode-gencat** - + Provide the data for + [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). 
+ This includes, but is not limited to, `Decimal_Number`, `Letter`, + `Math_Symbol`, `Number` and `Punctuation`. +* **unicode-perl** - + Provide the data for supporting the Unicode-aware Perl character classes, + corresponding to `\w`, `\s` and `\d`. This is also necessary for using + Unicode-aware word boundary assertions. Note that if this feature is + disabled, the `\s` and `\d` character classes are still available if the + `unicode-bool` and `unicode-gencat` features are enabled, respectively. +* **unicode-script** - + Provide the data for + [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). + This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, + `Latin` and `Thai`. +* **unicode-segment** - + Provide the data necessary to provide the properties used to implement the + [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). + This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and + `\p{sb=ATerm}`. +* **unicode-word-boundary** - + Enables support for Unicode word boundaries, i.e., `\b`, in regexes. When + this and `unicode-perl` are enabled, then data tables from `regex-syntax` are + used to implement Unicode word boundaries. However, if `regex-syntax` isn't + enabled as a dependency then one can still enable this feature. It will + cause `regex-automata` to bundle its own data table that would otherwise be + redundant with `regex-syntax`'s table. + +### Regex engine features + +* **syntax** - Enables a dependency on `regex-syntax`. This makes APIs +for building regex engines from pattern strings available. Without the +`regex-syntax` dependency, the only way to build a regex engine is generally +to deserialize a previously built DFA or to hand assemble an NFA using its +[builder API](nfa::thompson::Builder). Once you have an NFA, you can build any +of the regex engines in this crate. The `syntax` feature also enables `alloc`. +* **meta** - Enables the meta regex engine. 
This also enables the `syntax` and +`nfa-pikevm` features, as both are the minimal requirements needed. The meta +regex engine benefits from enabling any of the other regex engines and will +use them automatically when appropriate. +* **nfa** - Enables all NFA related features below. + * **nfa-thompson** - Enables the Thompson NFA APIs. This enables `alloc`. + * **nfa-pikevm** - Enables the PikeVM regex engine. This enables + `nfa-thompson`. + * **nfa-backtrack** - Enables the bounded backtracker regex engine. This + enables `nfa-thompson`. +* **dfa** - Enables all DFA related features below. + * **dfa-build** - Enables APIs for determinizing DFAs from NFAs. This + enables `nfa-thompson` and `dfa-search`. + * **dfa-search** - Enables APIs for searching with DFAs. + * **dfa-onepass** - Enables the one-pass DFA API. This enables + `nfa-thompson`. +* **hybrid** - Enables the hybrid NFA/DFA or "lazy DFA" regex engine. This +enables `alloc` and `nfa-thompson`. + */ // We are no_std. @@ -101,6 +627,9 @@ extern crate std; #[cfg(feature = "alloc")] extern crate alloc; +#[cfg(doctest)] +doc_comment::doctest!("../README.md"); + #[doc(inline)] pub use crate::util::primitives::PatternID; pub use crate::util::search::*; diff --git a/regex-automata/src/meta/limited.rs b/regex-automata/src/meta/limited.rs index bf3352f1dd..005878acdb 100644 --- a/regex-automata/src/meta/limited.rs +++ b/regex-automata/src/meta/limited.rs @@ -50,10 +50,7 @@ pub(crate) fn dfa_try_search_half_rev( use crate::dfa::Automaton; let mut mat = None; - let mut sid = match dfa.start_state_reverse(input)? { - None => return Ok(None), - Some(sid) => sid, - }; + let mut sid = dfa.start_state_reverse(input)?; if input.start() == input.end() { dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?; return Ok(mat); @@ -103,10 +100,7 @@ pub(crate) fn hybrid_try_search_half_rev( min_start: usize, ) -> Result, RetryError> { let mut mat = None; - let mut sid = match dfa.start_state_reverse(cache, input)? 
{ - None => return Ok(None), - Some(sid) => sid, - }; + let mut sid = dfa.start_state_reverse(cache, input)?; if input.start() == input.end() { hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; return Ok(mat); diff --git a/regex-automata/src/meta/regex.rs b/regex-automata/src/meta/regex.rs index 5e6dd22850..cc6ac78cb8 100644 --- a/regex-automata/src/meta/regex.rs +++ b/regex-automata/src/meta/regex.rs @@ -159,10 +159,10 @@ type CachePoolFn = /// /// # Example: anchored search /// -/// This example shows how use [`Input::anchored`] to run an anchored search, -/// even when the regex pattern itself isn't anchored. An anchored search -/// guarantees that if a match is found, then the start offset of the match -/// corresponds to the offset at which the search was started. +/// This example shows how to use [`Input::anchored`] to run an anchored +/// search, even when the regex pattern itself isn't anchored. An anchored +/// search guarantees that if a match is found, then the start offset of the +/// match corresponds to the offset at which the search was started. /// /// ``` /// use regex_automata::{meta::Regex, Anchored, Input, Match}; @@ -732,9 +732,9 @@ impl Regex { /// # Ok::<(), Box>(()) /// ``` /// - /// When the empty string is used as a regex, it splits every at every - /// valid UTF-8 boundary by default (which includes the beginning and - /// end of the haystack): + /// When the empty string is used as a regex, it splits at every valid + /// UTF-8 boundary by default (which includes the beginning and end of the + /// haystack): /// /// ``` /// use regex_automata::meta::Regex; @@ -827,11 +827,10 @@ impl Regex { /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"\W+").unwrap(); - /// let hay = "a b \t c\td e"; /// let hay = "Hey! 
How are you?"; /// let fields: Vec<&str> = /// re.splitn(hay, 3).map(|span| &hay[span]).collect(); - /// assert_eq!(fields, vec!("Hey", "How", "are you?")); + /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); /// /// # Ok::<(), Box>(()) /// ``` @@ -1815,6 +1814,57 @@ impl Regex { self.imp.info.config() } + /// Returns true if this regex has a high chance of being "accelerated." + /// + /// The precise meaning of "accelerated" is specifically left unspecified, + /// but the general meaning is that the search is a high likelihood of + /// running faster than than a character-at-a-time loop inside a standard + /// regex engine. + /// + /// When a regex is accelerated, it is only a *probabilistic* claim. That + /// is, just because the regex is believed to be accelerated, that doesn't + /// mean it will definitely execute searches very fast. Similarly, if a + /// regex is *not* accelerated, that is also a probabilistic claim. That + /// is, a regex for which `is_accelerated` returns `false` could still run + /// searches more quickly than a regex for which `is_accelerated` returns + /// `true`. + /// + /// Whether a regex is marked as accelerated or not is dependent on + /// implementations details that may change in a semver compatible release. + /// That is, a regex that is accelerated in a `x.y.1` release might not be + /// accelerated in a `x.y.2` release. + /// + /// Basically, the value of acceleration boils down to a hedge: a hodge + /// podge of internal heuristics combine to make a probabilistic guess + /// that this regex search may run "fast." The value in knowing this from + /// a caller's perspective is that it may act as a signal that no further + /// work should be done to accelerate a search. For example, a grep-like + /// tool might try to do some extra work extracting literals from a regex + /// to create its own heuristic acceleration strategies. But it might + /// choose to defer to this crate's acceleration strategy if one exists. 
+ /// This routine permits querying whether such a strategy is active for a + /// particular regex. + /// + /// # Example + /// + /// ``` + /// use regex_automata::meta::Regex; + /// + /// // A simple literal is very likely to be accelerated. + /// let re = Regex::new(r"foo")?; + /// assert!(re.is_accelerated()); + /// + /// // A regex with no literals is likely to not be accelerated. + /// let re = Regex::new(r"\w")?; + /// assert!(!re.is_accelerated()); + /// + /// # Ok::<(), Box>(()) + /// ``` + #[inline] + pub fn is_accelerated(&self) -> bool { + self.imp.strat.is_accelerated() + } + /// Return the total approximate heap memory, in bytes, used by this `Regex`. /// /// Note that currently, there is no high level configuration for setting @@ -2819,6 +2869,12 @@ impl Config { /// /// By default, `\n` is the line terminator. /// + /// **Warning**: This does not change the behavior of `.`. To do that, + /// you'll need to configure the syntax option + /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator) + /// in addition to this. Otherwise, `.` will continue to match any + /// character other than `\n`. + /// /// # Example /// /// ``` diff --git a/regex-automata/src/meta/stopat.rs b/regex-automata/src/meta/stopat.rs index 23b916aa09..e8d716689c 100644 --- a/regex-automata/src/meta/stopat.rs +++ b/regex-automata/src/meta/stopat.rs @@ -56,10 +56,7 @@ pub(crate) fn dfa_try_search_half_fwd( use crate::dfa::{accel, Automaton}; let mut mat = None; - let mut sid = match dfa.start_state_forward(input)? { - None => return Ok(Err(input.start())), - Some(sid) => sid, - }; + let mut sid = dfa.start_state_forward(input)?; let mut at = input.start(); while at < input.end() { sid = dfa.next_state(sid, input.haystack()[at]); @@ -109,10 +106,7 @@ pub(crate) fn hybrid_try_search_half_fwd( input: &Input<'_>, ) -> Result, RetryFailError> { let mut mat = None; - let mut sid = match dfa.start_state_forward(cache, input)? 
{ - None => return Ok(Err(input.start())), - Some(sid) => sid, - }; + let mut sid = dfa.start_state_forward(cache, input)?; let mut at = input.start(); while at < input.end() { sid = dfa diff --git a/regex-automata/src/meta/strategy.rs b/regex-automata/src/meta/strategy.rs index 69028d1b05..2de2c385ec 100644 --- a/regex-automata/src/meta/strategy.rs +++ b/regex-automata/src/meta/strategy.rs @@ -23,6 +23,20 @@ use crate::{ }, }; +/// A trait that represents a single meta strategy. Its main utility is in +/// providing a way to do dynamic dispatch over a few choices. +/// +/// Why dynamic dispatch? I actually don't have a super compelling reason, and +/// importantly, I have not benchmarked it with the main alternative: an enum. +/// I went with dynamic dispatch initially because the regex engine search code +/// really can't be inlined into caller code in most cases because it's just +/// too big. In other words, it is already expected that every regex search +/// will entail at least the cost of a function call. +/// +/// I do wonder whether using enums would result in better codegen overall +/// though. It's a worthwhile experiment to try. Probably the most interesting +/// benchmark to run in such a case would be one with a high match count. That +/// is, a benchmark to test the overall latency of a search call. 
pub(super) trait Strategy: Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static { @@ -32,6 +46,8 @@ pub(super) trait Strategy: fn reset_cache(&self, cache: &mut Cache); + fn is_accelerated(&self) -> bool; + fn memory_usage(&self) -> usize; fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option; @@ -61,38 +77,18 @@ pub(super) fn new( info: &RegexInfo, hirs: &[&Hir], ) -> Result, BuildError> { - let kind = info.config().get_match_kind(); - let prefixes = crate::util::prefilter::prefixes(kind, hirs); - if let Some(pre) = Pre::from_prefixes(info, &prefixes) { - debug!( - "found that the regex can be broken down to a literal \ - search, avoiding the regex engine entirely", - ); - return Ok(pre); - } - // This now attempts another short-circuit of the regex engine: if we - // have a huge alternation of just plain literals, then we can just use - // Aho-Corasick for that and avoid the regex engine entirely. - if let Some(pre) = Pre::from_alternation_literals(info, hirs) { - debug!( - "found plain alternation of literals, \ - avoiding regex engine entirely and using Aho-Corasick" - ); - return Ok(pre); - } - // At this point, we're committed to a regex engine of some kind. So pull // out a prefilter if we can, which will feed to each of the constituent // regex engines. let pre = if info.is_always_anchored_start() { // PERF: I'm not sure we necessarily want to do this... We may want to - // run a prefilter for quickly rejecting in some cases. The problem is - // that anchored searches overlap quite a bit with the use case of - // "run a regex on every line to extra data." In that case, the regex - // always matches, so running a prefilter doesn't really help us there. - // The main place where a prefilter helps in an anchored search is if - // the anchored search is not expected to match frequently. That is, - // the prefilter gives us a way to possibly reject a haystack very + // run a prefilter for quickly rejecting in some cases. 
The problem + // is that anchored searches overlap quite a bit with the use case + // of "run a regex on every line to extract data." In that case, the + // regex always matches, so running a prefilter doesn't really help us + // there. The main place where a prefilter helps in an anchored search + // is if the anchored search is not expected to match frequently. That + // is, the prefilter gives us a way to possibly reject a haystack very // quickly. // // Maybe we should do use a prefilter, but only for longer haystacks? @@ -102,15 +98,44 @@ pub(super) fn new( // disabling a prefilter based on haystack length. That would probably // need to be a new 'Input' option. (Interestingly, an 'Input' used to // carry a 'Prefilter' with it, but I moved away from that.) - debug!("discarding prefixes (if any) since regex is anchored"); + debug!("skipping literal extraction since regex is anchored"); None } else if let Some(pre) = info.config().get_prefilter() { debug!( - "discarding extracted prefixes (if any) \ - since the caller provided a prefilter" + "skipping literal extraction since the caller provided a prefilter" ); Some(pre.clone()) } else if info.config().get_auto_prefilter() { + let kind = info.config().get_match_kind(); + let prefixes = crate::util::prefilter::prefixes(kind, hirs); + // If we can build a full `Strategy` from just the extracted prefixes, + // then we can short-circuit and avoid building a regex engine at all. + if let Some(pre) = Pre::from_prefixes(info, &prefixes) { + debug!( + "found that the regex can be broken down to a literal \ + search, avoiding the regex engine entirely", + ); + return Ok(pre); + } + // This now attempts another short-circuit of the regex engine: if we + // have a huge alternation of just plain literals, then we can just use + // Aho-Corasick for that and avoid the regex engine entirely. 
+ // + // You might think this case would just be handled by + // `Pre::from_prefixes`, but that technique relies on heuristic literal + // extraction from the corresponding `Hir`. That works, but part of + // heuristics limit the size and number of literals returned. This case + // will specifically handle patterns with very large alternations. + // + // One wonders if we should just roll this our heuristic literal + // extraction, and then I think this case could disappear entirely. + if let Some(pre) = Pre::from_alternation_literals(info, hirs) { + debug!( + "found plain alternation of literals, \ + avoiding regex engine entirely and using Aho-Corasick" + ); + return Ok(pre); + } prefixes.literals().and_then(|strings| { debug!( "creating prefilter from {} literals: {:?}", @@ -120,7 +145,7 @@ pub(super) fn new( Prefilter::new(kind, strings) }) } else { - debug!("discarding prefixes (if any) since prefilters were disabled"); + debug!("skipping literal extraction since prefilters were disabled"); None }; let mut core = Core::new(info.clone(), pre.clone(), hirs)?; @@ -343,6 +368,10 @@ impl Strategy for Pre

{ fn reset_cache(&self, _cache: &mut Cache) {} + fn is_accelerated(&self) -> bool { + self.pre.is_fast() + } + fn memory_usage(&self) -> usize { self.pre.memory_usage() } @@ -622,6 +651,10 @@ impl Strategy for Core { cache.hybrid.reset(&self.hybrid); } + fn is_accelerated(&self) -> bool { + self.pre.as_ref().map_or(false, |pre| pre.is_fast()) + } + fn memory_usage(&self) -> usize { self.info.memory_usage() + self.pre.as_ref().map_or(0, |pre| pre.memory_usage()) @@ -884,6 +917,13 @@ impl Strategy for ReverseAnchored { self.core.reset_cache(cache); } + fn is_accelerated(&self) -> bool { + // Since this is anchored at the end, a reverse anchored search is + // almost certainly guaranteed to result in a much faster search than + // a standard forward search. + true + } + fn memory_usage(&self) -> usize { self.core.memory_usage() } @@ -1161,6 +1201,10 @@ impl Strategy for ReverseSuffix { self.core.reset_cache(cache); } + fn is_accelerated(&self) -> bool { + self.pre.is_fast() + } + fn memory_usage(&self) -> usize { self.core.memory_usage() + self.pre.memory_usage() } @@ -1578,6 +1622,10 @@ impl Strategy for ReverseInner { cache.revhybrid.reset(&self.hybrid); } + fn is_accelerated(&self) -> bool { + self.preinner.is_fast() + } + fn memory_usage(&self) -> usize { self.core.memory_usage() + self.preinner.memory_usage() diff --git a/regex-automata/src/nfa/thompson/backtrack.rs b/regex-automata/src/nfa/thompson/backtrack.rs index e95708de16..4358652b89 100644 --- a/regex-automata/src/nfa/thompson/backtrack.rs +++ b/regex-automata/src/nfa/thompson/backtrack.rs @@ -129,7 +129,7 @@ impl Config { /// The visited capacity represents the amount of heap memory (in bytes) to /// allocate toward tracking which parts of the backtracking search have /// been done before. The heap memory needed for any particular search is - /// proportional to `haystack.len() * nfa.states().len()`, whichc an be + /// proportional to `haystack.len() * nfa.states().len()`, which an be /// quite large. 
Therefore, the bounded backtracker is typically only able /// to run on shorter haystacks. /// diff --git a/regex-automata/src/nfa/thompson/nfa.rs b/regex-automata/src/nfa/thompson/nfa.rs index bb188ed2e1..6e46b04df9 100644 --- a/regex-automata/src/nfa/thompson/nfa.rs +++ b/regex-automata/src/nfa/thompson/nfa.rs @@ -1100,6 +1100,11 @@ impl NFA { self.0.look_set_prefix_any } + // FIXME: The `look_set_prefix_all` computation was not correct, and it + // seemed a little tricky to fix it. Since I wasn't actually using it for + // anything, I just decided to remove it in the run up to the regex 1.9 + // release. If you need this, please file an issue. + /* /// Returns the intersection of all prefix look-around assertions for every /// pattern in this NFA. When the returned set is empty, it implies at /// least one of the patterns does not require moving through a conditional @@ -1127,7 +1132,7 @@ impl NFA { /// // When multiple patterns are present, since this returns the /// // intersection, it will only include assertions present in every /// // prefix, and only the prefix. - /// let nfa = NFA::new_many(&["^a$", "^b$", "^ab$", "^c$"])?; + /// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); /// @@ -1137,6 +1142,7 @@ impl NFA { pub fn look_set_prefix_all(&self) -> LookSet { self.0.look_set_prefix_all } + */ /// Returns the memory usage, in bytes, of this NFA. /// @@ -1249,9 +1255,11 @@ pub(super) struct Inner { /// The union of all look-around assertions that occur as a zero-length /// prefix for any of the patterns in this NFA. look_set_prefix_any: LookSet, + /* /// The intersection of all look-around assertions that occur as a /// zero-length prefix for any of the patterns in this NFA. 
look_set_prefix_all: LookSet, + */ /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this @@ -1271,7 +1279,8 @@ impl Inner { for &start_id in self.start_pattern.iter() { stack.push(start_id); seen.clear(); - let mut prefix = LookSet::empty(); + // let mut prefix_all = LookSet::full(); + let mut prefix_any = LookSet::empty(); while let Some(sid) = stack.pop() { if !seen.insert(sid) { continue; @@ -1307,7 +1316,7 @@ impl Inner { } State::Match { .. } => self.has_empty = true, State::Look { look, next } => { - prefix = prefix.insert(look); + prefix_any = prefix_any.insert(look); stack.push(next); } State::Union { ref alternates } => { @@ -1326,8 +1335,8 @@ impl Inner { } } } - self.look_set_prefix_any = self.look_set_prefix_any.union(prefix); - self.look_set_prefix_all = self.look_set_prefix_all.union(prefix); + self.look_set_prefix_any = + self.look_set_prefix_any.union(prefix_any); } NFA(Arc::new(self)) } diff --git a/regex-automata/src/nfa/thompson/range_trie.rs b/regex-automata/src/nfa/thompson/range_trie.rs index 1684e59a74..2522e7fe01 100644 --- a/regex-automata/src/nfa/thompson/range_trie.rs +++ b/regex-automata/src/nfa/thompson/range_trie.rs @@ -105,8 +105,8 @@ overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no simple way to apply Daciuk's algorithm. And thus, the range trie was born. The range trie's only purpose is to take -sequences of byte ranges like the ones above, collect them into a trie and -then spit them in a sorted fashion with no overlapping ranges. For example, +sequences of byte ranges like the ones above, collect them into a trie and then +spit them out in a sorted fashion with no overlapping ranges. For example, 0x00-0x10FFFF gets translated to: [0-7F] @@ -130,42 +130,31 @@ We've thus satisfied our requirements for running Daciuk's algorithm. 
All sequences of ranges are sorted, and any corresponding ranges are either exactly equivalent or non-overlapping. -In effect, a range trie is building a DFA from a sequence of arbitrary -byte ranges. But it uses an algoritm custom tailored to its input, so it -is not as costly as traditional DFA construction. While it is still quite -a bit more costly than the forward's case (which only needs Daciuk's -algorithm), it winds up saving a substantial amount of time if one is doing -a full DFA powerset construction later by virtue of producing a much much -smaller NFA. +In effect, a range trie is building a DFA from a sequence of arbitrary byte +ranges. But it uses an algoritm custom tailored to its input, so it is not as +costly as traditional DFA construction. While it is still quite a bit more +costly than the forward case (which only needs Daciuk's algorithm), it winds +up saving a substantial amount of time if one is doing a full DFA powerset +construction later by virtue of producing a much much smaller NFA. [1] - https://blog.burntsushi.net/transducers/ [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 */ -use core::{ - cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive, u32, -}; +use core::{cell::RefCell, convert::TryFrom, fmt, mem, ops::RangeInclusive}; use alloc::{format, string::String, vec, vec::Vec}; use regex_syntax::utf8::Utf8Range; -/// A smaller state ID means more effective use of the CPU cache and less -/// time spent copying. The implementation below will panic if the state ID -/// space is exhausted, but in order for that to happen, the range trie itself -/// would use well over 100GB of memory. Moreover, it's likely impossible -/// for the state ID space to get that big. In fact, it's likely that even a -/// u16 would be good enough here. But it's not quite clear how to prove this. -/// -/// TODO: We should switch to using crate::util::primitives::StateID. 
-type StateID = u32; +use crate::util::primitives::StateID; /// There is only one final state in this trie. Every sequence of byte ranges /// added shares the same final state. -const FINAL: StateID = 0; +const FINAL: StateID = StateID::ZERO; /// The root state of the trie. -const ROOT: StateID = 1; +const ROOT: StateID = StateID::new_unchecked(1); /// A range trie represents an ordered set of sequences of bytes. /// @@ -550,12 +539,12 @@ impl RangeTrie { /// Return an immutable borrow for the state with the given ID. fn state(&self, id: StateID) -> &State { - &self.states[usize::try_from(id).unwrap()] + &self.states[id] } /// Return a mutable borrow for the state with the given ID. fn state_mut(&mut self, id: StateID) -> &mut State { - &mut self.states[usize::try_from(id).unwrap()] + &mut self.states[id] } } @@ -877,11 +866,9 @@ impl Split { impl fmt::Debug for RangeTrie { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - // OK since FINAL == 0. - let ufinal = usize::try_from(FINAL).unwrap(); writeln!(f, "")?; for (i, state) in self.states.iter().enumerate() { - let status = if i == ufinal { '*' } else { ' ' }; + let status = if i == FINAL.as_usize() { '*' } else { ' ' }; writeln!(f, "{}{:06}: {:?}", status, i, state)?; } Ok(()) @@ -903,12 +890,19 @@ impl fmt::Debug for State { impl fmt::Debug for Transition { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.range.start == self.range.end { - write!(f, "{:02X} => {:02X}", self.range.start, self.next_id) + write!( + f, + "{:02X} => {:02X}", + self.range.start, + self.next_id.as_usize(), + ) } else { write!( f, "{:02X}-{:02X} => {:02X}", - self.range.start, self.range.end, self.next_id + self.range.start, + self.range.end, + self.next_id.as_usize(), ) } } diff --git a/regex-automata/src/util/captures.rs b/regex-automata/src/util/captures.rs index d4f718b436..30bcced498 100644 --- a/regex-automata/src/util/captures.rs +++ b/regex-automata/src/util/captures.rs @@ -32,7 +32,7 @@ directly, 
but for example, if you've compiled an Thompson NFA, then you can use underlying `GroupInfo`. */ -use alloc::{format, string::String, sync::Arc, vec, vec::Vec}; +use alloc::{string::String, sync::Arc, vec, vec::Vec}; use crate::util::{ interpolate, @@ -1219,19 +1219,26 @@ struct CapturesDebugMap<'a> { impl<'a> core::fmt::Debug for CapturesDebugMap<'a> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + struct Key<'a>(usize, Option<&'a str>); + + impl<'a> core::fmt::Debug for Key<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.0)?; + if let Some(name) = self.1 { + write!(f, "/{:?}", name)?; + } + Ok(()) + } + } + let mut map = f.debug_map(); let names = self.caps.group_info().pattern_names(self.pid); for (group_index, maybe_name) in names.enumerate() { - let span = self.caps.get_group(group_index); - let debug_span: &dyn core::fmt::Debug = match span { - None => &None::<()>, - Some(ref span) => span, + let key = Key(group_index, maybe_name); + match self.caps.get_group(group_index) { + None => map.entry(&key, &None::<()>), + Some(span) => map.entry(&key, &span), }; - if let Some(name) = maybe_name { - map.entry(&format!("{}/{}", group_index, name), debug_span); - } else { - map.entry(&group_index, debug_span); - } } map.finish() } diff --git a/regex-automata/src/util/escape.rs b/regex-automata/src/util/escape.rs index 52e7227909..7f6aa15f5d 100644 --- a/regex-automata/src/util/escape.rs +++ b/regex-automata/src/util/escape.rs @@ -1,18 +1,22 @@ /*! -This module defines a few convenience routines for escaping raw bytes. Namely, -since this crate tends to deal with `&[u8]` everywhere and the default +Provides convenience routines for escaping raw bytes. + +Since this crate tends to deal with `&[u8]` everywhere and the default `Debug` implementation just shows decimal integers, it makes debugging those -representations quite difficult. 
So this module provides types that show -`&[u8]` as if it were a string, with invalid UTF-8 escaped into its hex +representations quite difficult. This module provides types that show `&[u8]` +as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex representation. */ use crate::util::utf8; -/// A type that wraps a single byte with a convenient fmt::Debug impl that -/// escapes the byte. +/// Provides a convenient `Debug` implementation for a `u8`. +/// +/// The `Debug` impl treats the byte as an ASCII, and emits a human readable +/// representation of it. If the byte isn't ASCII, then it's emitted as a hex +/// escape sequence. #[derive(Clone, Copy)] -pub(crate) struct DebugByte(pub(crate) u8); +pub struct DebugByte(pub u8); impl core::fmt::Debug for DebugByte { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { @@ -37,11 +41,12 @@ impl core::fmt::Debug for DebugByte { } } -/// A type that provides a human readable debug impl for arbitrary bytes. +/// Provides a convenient `Debug` implementation for `&[u8]`. /// /// This generally works best when the bytes are presumed to be mostly UTF-8, -/// but will work for anything. -pub(crate) struct DebugHaystack<'a>(pub(crate) &'a [u8]); +/// but will work for anything. For any bytes that aren't UTF-8, they are +/// emitted as hex escape sequences. 
+pub struct DebugHaystack<'a>(pub &'a [u8]); impl<'a> core::fmt::Debug for DebugHaystack<'a> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { diff --git a/regex-automata/src/util/interpolate.rs b/regex-automata/src/util/interpolate.rs index a74d361e96..f274629df4 100644 --- a/regex-automata/src/util/interpolate.rs +++ b/regex-automata/src/util/interpolate.rs @@ -281,8 +281,8 @@ fn find_cap_ref(replacement: &[u8]) -> Option> { let cap = core::str::from_utf8(&rep[i..cap_end]) .expect("valid UTF-8 capture name"); Some(CaptureRef { - cap: match cap.parse::() { - Ok(i) => Ref::Number(i as usize), + cap: match cap.parse::() { + Ok(i) => Ref::Number(i), Err(_) => Ref::Named(cap), }, end: cap_end, @@ -310,8 +310,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { Ok(cap) => cap, }; Some(CaptureRef { - cap: match cap.parse::() { - Ok(i) => Ref::Number(i as usize), + cap: match cap.parse::() { + Ok(i) => Ref::Number(i), Err(_) => Ref::Named(cap), }, end: i + 1, diff --git a/regex-automata/src/util/iter.rs b/regex-automata/src/util/iter.rs index 095d521614..4fda8dc4cb 100644 --- a/regex-automata/src/util/iter.rs +++ b/regex-automata/src/util/iter.rs @@ -712,6 +712,15 @@ impl<'h, F> TryHalfMatchesIter<'h, F> { pub fn infallible(self) -> HalfMatchesIter<'h, F> { HalfMatchesIter(self) } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } } impl<'h, F> Iterator for TryHalfMatchesIter<'h, F> @@ -757,6 +766,17 @@ impl<'h, F> core::fmt::Debug for TryHalfMatchesIter<'h, F> { #[derive(Debug)] pub struct HalfMatchesIter<'h, F>(TryHalfMatchesIter<'h, F>); +impl<'h, F> HalfMatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. 
+ /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + impl<'h, F> Iterator for HalfMatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, @@ -808,6 +828,15 @@ impl<'h, F> TryMatchesIter<'h, F> { pub fn infallible(self) -> MatchesIter<'h, F> { MatchesIter(self) } + + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.it.input() + } } impl<'h, F> Iterator for TryMatchesIter<'h, F> @@ -852,6 +881,17 @@ impl<'h, F> core::fmt::Debug for TryMatchesIter<'h, F> { #[derive(Debug)] pub struct MatchesIter<'h, F>(TryMatchesIter<'h, F>); +impl<'h, F> MatchesIter<'h, F> { + /// Returns the current `Input` used by this iterator. + /// + /// The `Input` returned is generally equivalent to the one used to + /// construct this iterator, but its start position may be different to + /// reflect the start of the next search to be executed. + pub fn input<'i>(&'i self) -> &'i Input<'h> { + self.0.it.input() + } +} + impl<'h, F> Iterator for MatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, diff --git a/regex-automata/src/util/lazy.rs b/regex-automata/src/util/lazy.rs index 8b30915ab0..b9f013c88c 100644 --- a/regex-automata/src/util/lazy.rs +++ b/regex-automata/src/util/lazy.rs @@ -30,9 +30,9 @@ use core::fmt; /// # Warning: may use a spin lock /// /// When this crate is compiled _without_ the `alloc` feature, then this type -/// may used a spin lock internally. This can have subtle effects that may be -/// desirable. 
See [Spinlocks Considered Harmful][spinharm] for a more thorough -/// treatment of this topic. +/// may used a spin lock internally. This can have subtle effects that may +/// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more +/// thorough treatment of this topic. /// /// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html /// diff --git a/regex-automata/src/util/mod.rs b/regex-automata/src/util/mod.rs index 71d38192c7..bb739df1df 100644 --- a/regex-automata/src/util/mod.rs +++ b/regex-automata/src/util/mod.rs @@ -12,6 +12,7 @@ walking its state graph directly. * `captures` contains APIs for dealing with capture group matches and their mapping to "slots" used inside an NFA graph. This is also where you can find iterators over capture group names. +* `escape` contains types for pretty-printing raw byte slices as strings. * `iter` contains API helpers for writing regex iterators. * `lazy` contains a no-std and no-alloc variant of `lazy_static!` and `once_cell`. @@ -29,6 +30,7 @@ with the `regex-syntax` crate. pub mod alphabet; #[cfg(feature = "alloc")] pub mod captures; +pub mod escape; #[cfg(feature = "alloc")] pub mod interpolate; pub mod iter; @@ -45,7 +47,6 @@ pub mod wire; #[cfg(any(feature = "dfa-build", feature = "hybrid"))] pub(crate) mod determinize; pub(crate) mod empty; -pub(crate) mod escape; pub(crate) mod int; pub(crate) mod memchr; pub(crate) mod search; diff --git a/regex-automata/src/util/pool.rs b/regex-automata/src/util/pool.rs index 50dbc51411..7f4a1c21e2 100644 --- a/regex-automata/src/util/pool.rs +++ b/regex-automata/src/util/pool.rs @@ -123,9 +123,9 @@ being quite expensive. /// # Warning: may use a spin lock /// /// When this crate is compiled _without_ the `std` feature, then this type -/// may used a spin lock internally. This can have subtle effects that may be -/// desirable. See [Spinlocks Considered Harmful][spinharm] for a more thorough -/// treatment of this topic. 
+/// may use a spin lock internally.
#[derive(Clone, Debug, Eq, PartialEq)] pub struct MatchError( diff --git a/regex-automata/src/util/syntax.rs b/regex-automata/src/util/syntax.rs index 534ee9313e..9260ea20cb 100644 --- a/regex-automata/src/util/syntax.rs +++ b/regex-automata/src/util/syntax.rs @@ -147,6 +147,7 @@ pub struct Config { multi_line: bool, dot_matches_new_line: bool, crlf: bool, + line_terminator: u8, swap_greed: bool, ignore_whitespace: bool, unicode: bool, @@ -164,6 +165,7 @@ impl Config { multi_line: false, dot_matches_new_line: false, crlf: false, + line_terminator: b'\n', swap_greed: false, ignore_whitespace: false, unicode: true, @@ -239,6 +241,31 @@ impl Config { self } + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(mut self, byte: u8) -> Config { + self.line_terminator = byte; + self + } + /// Enable or disable the "swap greed" flag by default. 
/// /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` @@ -377,6 +404,11 @@ impl Config { self.crlf } + /// Returns the line terminator in this syntax configuration. + pub fn get_line_terminator(&self) -> u8 { + self.line_terminator + } + /// Returns whether "swap greed" mode is enabled. pub fn get_swap_greed(&self) -> bool { self.swap_greed @@ -410,6 +442,7 @@ impl Config { .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) .crlf(self.crlf) + .line_terminator(self.line_terminator) .swap_greed(self.swap_greed) .ignore_whitespace(self.ignore_whitespace) .utf8(self.utf8) @@ -436,6 +469,7 @@ impl Config { .multi_line(self.multi_line) .crlf(self.crlf) .dot_matches_new_line(self.dot_matches_new_line) + .line_terminator(self.line_terminator) .swap_greed(self.swap_greed) .utf8(self.utf8); } diff --git a/regex-automata/tests/dfa/onepass/suite.rs b/regex-automata/tests/dfa/onepass/suite.rs index bed86270d9..20bd6965c8 100644 --- a/regex-automata/tests/dfa/onepass/suite.rs +++ b/regex-automata/tests/dfa/onepass/suite.rs @@ -193,4 +193,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config { .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) + .line_terminator(test.line_terminator()) } diff --git a/regex-automata/tests/dfa/suite.rs b/regex-automata/tests/dfa/suite.rs index 7b8f41c719..f3445e02a4 100644 --- a/regex-automata/tests/dfa/suite.rs +++ b/regex-automata/tests/dfa/suite.rs @@ -391,6 +391,7 @@ fn config_syntax(test: &RegexTest) -> syntax::Config { .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) + .line_terminator(test.line_terminator()) } /// Execute an overlapping search, and for each match found, also find its diff --git a/regex-automata/tests/hybrid/suite.rs b/regex-automata/tests/hybrid/suite.rs index 962fd87143..4aaca66984 100644 --- a/regex-automata/tests/hybrid/suite.rs +++ b/regex-automata/tests/hybrid/suite.rs @@ -281,6 +281,7 @@ fn 
config_syntax(test: &RegexTest) -> syntax::Config { .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) + .line_terminator(test.line_terminator()) } /// Execute an overlapping search, and for each match found, also find its diff --git a/regex-automata/tests/meta/suite.rs b/regex-automata/tests/meta/suite.rs index 9ee2b33ddf..20f97b4bb9 100644 --- a/regex-automata/tests/meta/suite.rs +++ b/regex-automata/tests/meta/suite.rs @@ -196,4 +196,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config { .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) + .line_terminator(test.line_terminator()) } diff --git a/regex-automata/tests/nfa/thompson/backtrack/suite.rs b/regex-automata/tests/nfa/thompson/backtrack/suite.rs index 1476b7aaf9..bce0eef408 100644 --- a/regex-automata/tests/nfa/thompson/backtrack/suite.rs +++ b/regex-automata/tests/nfa/thompson/backtrack/suite.rs @@ -209,4 +209,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config { .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) + .line_terminator(test.line_terminator()) } diff --git a/regex-automata/tests/nfa/thompson/pikevm/suite.rs b/regex-automata/tests/nfa/thompson/pikevm/suite.rs index a3f2457f76..d32842a156 100644 --- a/regex-automata/tests/nfa/thompson/pikevm/suite.rs +++ b/regex-automata/tests/nfa/thompson/pikevm/suite.rs @@ -158,4 +158,5 @@ fn config_syntax(test: &RegexTest) -> syntax::Config { .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) + .line_terminator(test.line_terminator()) } diff --git a/regex-capi/src/rure.rs b/regex-capi/src/rure.rs index d2e1539ed2..9e17668e26 100644 --- a/regex-capi/src/rure.rs +++ b/regex-capi/src/rure.rs @@ -41,7 +41,7 @@ pub struct rure_match { pub end: size_t, } -pub struct Captures(bytes::Locations); +pub struct Captures(bytes::CaptureLocations); pub struct Iter { re: *const Regex, @@ -198,7 +198,7 @@ ffi_fn! 
{ let re = unsafe { &*re }; let haystack = unsafe { slice::from_raw_parts(haystack, len) }; let slots = unsafe { &mut (*captures).0 }; - re.read_captures_at(slots, haystack, start).is_some() + re.captures_read_at(slots, haystack, start).is_some() } } @@ -375,7 +375,7 @@ ffi_fn! { if it.last_end > text.len() { return false; } - let (s, e) = match re.read_captures_at(slots, text, it.last_end) { + let (s, e) = match re.captures_read_at(slots, text, it.last_end) { None => return false, Some(m) => (m.start(), m.end()), }; @@ -400,7 +400,7 @@ ffi_fn! { ffi_fn! { fn rure_captures_new(re: *const Regex) -> *mut Captures { let re = unsafe { &*re }; - let captures = Captures(re.locations()); + let captures = Captures(re.capture_locations()); Box::into_raw(Box::new(captures)) } } @@ -418,7 +418,7 @@ ffi_fn! { match_info: *mut rure_match, ) -> bool { let locs = unsafe { &(*captures).0 }; - match locs.pos(i) { + match locs.get(i) { Some((start, end)) => { if !match_info.is_null() { unsafe { @@ -562,7 +562,7 @@ ffi_fn! { for item in matches.iter_mut() { *item = false; } - re.read_matches_at(&mut matches, haystack, start) + re.matches_read_at(&mut matches, haystack, start) } } diff --git a/regex-cli/README.md b/regex-cli/README.md new file mode 100644 index 0000000000..36dc50e772 --- /dev/null +++ b/regex-cli/README.md @@ -0,0 +1,265 @@ +regex-cli +========= +This is a command line tool for interacting with the regex, regex-automata and +regex-syntax crates. It enables one to print debug representations of various +values, run searches, generate DFAs and deserialization code and perform +various regex development tasks such as generating tests. 
+ +### Installation + +Currently `regex-cli` is not on crates.io and should be installed from this +git repository: + +``` +$ cargo install --git https://github.com/rust-lang/regex regex-cli +``` + + +### Example: print debug output + +The `regex-cli` command provides a way to print the debug output for most +of the principal types in the `regex-automata` crate. This can be useful for +debugging purposes when working on the `regex` project, or even if you just +want a better look at a regex object's internal representation. For example, +the following two commands compare and contrast the differences in the NFA for +`.` and `(?-u:.)`: + +``` +$ regex-cli debug thompson '.' --no-table + +thompson::NFA( +>000000: binary-union(2, 1) + 000001: \x00-\xFF => 0 +^000002: capture(pid=0, group=0, slot=0) => 10 + 000003: \x80-\xBF => 11 + 000004: \xA0-\xBF => 3 + 000005: \x80-\xBF => 3 + 000006: \x80-\x9F => 3 + 000007: \x90-\xBF => 5 + 000008: \x80-\xBF => 5 + 000009: \x80-\x8F => 5 + 000010: sparse(\x00-\t => 11, \x0B-\x7F => 11, \xC2-\xDF => 3, \xE0 => 4, \xE1-\xEC => 5, \xED => 6, \xEE-\xEF => 5, \xF0 => 7, \xF1-\xF3 => 8, \xF4 => 9) + 000011: capture(pid=0, group=0, slot=1) => 12 + 000012: MATCH(0) + +transition equivalence classes: ByteClasses(0 => [\x00-\t], 1 => [\n], 2 => [\x0B-\x7F], 3 => [\x80-\x8F], 4 => [\x90-\x9F], 5 => [\xA0-\xBF], 6 => [\xC0-\xC1], 7 => [\xC2-\xDF], 8 => [\xE0], 9 => [\xE1-\xEC], 10 => [\xED], 11 => [\xEE-\xEF], 12 => [\xF0], 13 => [\xF1-\xF3], 14 => [\xF4], 15 => [\xF5-\xFF], 16 => [EOI]) +) +``` + +And now for `(?-u:.)`: + +``` +$ regex-cli debug thompson -b '(?-u:.)' --no-table + +thompson::NFA( +>000000: binary-union(2, 1) + 000001: \x00-\xFF => 0 +^000002: capture(pid=0, group=0, slot=0) => 3 + 000003: sparse(\x00-\t => 4, \x0B-\xFF => 4) + 000004: capture(pid=0, group=0, slot=1) => 5 + 000005: MATCH(0) + +transition equivalence classes: ByteClasses(0 => [\x00-\t], 1 => [\n], 2 => [\x0B-\xFF], 3 => [EOI]) +) +``` + +To make things a bit 
more concise, we use `--no-table` to omit some extra +metadata about the size of the NFA and the time required to build it. + +In the second example, we also pass the `-b/--no-utf8-syntax` flag. Without +it, the command returns an error because patterns are compiled with default +settings. The default setting is to forbid any pattern that can possibly match +invalid UTF-8. Since `(?-u:.)` matches any byte except for `\n`, it can match +invalid UTF-8. Thus, you have to say, "I am explicitly okay with matching +invalid UTF-8." + + +### Example: execute a search + +This command shows how to run a search with multiple patterns with each +containing capture groups. The output shows the value of each matching group. + +``` +$ regex-cli find capture meta -p '(?m)^(?[[:word:]]+)="(?[^"]+)"$' -p $'(?m)^(?[[:word:]]+)=\'(?[^\']+)\'$' -y 'best_album="Blow Your Face Out"' + parse time: 81.541µs + translate time: 52.035µs +build meta time: 805.696µs + search time: 426.391µs + total matches: 1 +0:{ 0: 0..31/best_album="Blow\x20Your\x20Face\x20Out", 1/key: 0..10/best_album, 2/val: 12..30/Blow\x20Your\x20Face\x20Out } +``` + +In this case, `meta` refers to the regex engine. It can be a number of other +things, including `lite` for testing the `regex-lite` crate. Also, `capture` +refers to the kind of search. You can also just ask for the `match` which will +print the overall match and not the capture groups: + +``` +$ regex-cli find match meta -p '(?m)^(?[[:word:]]+)="(?[^"]+)"$' -p $'(?m)^(?[[:word:]]+)=\'(?[^\']+)\'$' -y 'best_album="Blow Your Face Out"' + parse time: 67.067µs + translate time: 40.005µs +build meta time: 586.163µs + search time: 291.633µs + total matches: 1 +0:0:31:best_album="Blow\x20Your\x20Face\x20Out" +``` + +Since not all regex engines support capture groups, using `match` will open up +the ability to test other regex engines such as `hybrid`. 
+ +Finally, the `-p/--pattern` flag specifies a pattern and the `-y/--haystack` +flag provides a haystack to search as a command line argument. One can also omit +the `-y/--haystack` flag and provide a file path to search instead: + +``` +$ echo 'best_album="Blow Your Face Out"' > haystack +$ regex-cli find match hybrid -p '(?m)^(?[[:word:]]+)="(?[^"]+)"$' -p $'(?m)^(?[[:word:]]+)=\'(?[^\']+)\'$' haystack + parse time: 60.278µs + translate time: 43.832µs + compile forward nfa time: 462.148µs + compile reverse nfa time: 56.275µs +build forward hybrid time: 6.532µs +build reverse hybrid time: 4.089µs + build regex time: 4.899µs + cache creation time: 18.59µs + search time: 54.653µs + total matches: 1 +0:0:31:best_album="Blow\x20Your\x20Face\x20Out" +``` + + +### Example: serialize a DFA + +One particularly useful command in `regex-cli` is `regex-cli generate +serialize`. It takes care of generating and writing a fully compiled DFA to +a file, and then producing Rust code that deserializes it. The command line +provides oodles of options, including all options found in the `regex-automata` +crate for building the DFA in code. + +Let's walk through a complete end-to-end example. We assume `regex-cli` is +already installed per instructions above. Let's start with an empty binary +Rust project: + +``` +$ mkdir regex-dfa-test +$ cd regex-dfa-test +$ cargo init --bin +``` + +Now add a dependency on `regex-automata`. Technically, the only feature that +needs to be enabled for this example is `dfa-search`, but we include `std` as +well to get some conveniences like `std::error::Error` implementations and also +optimizations. But you can drop `std` and just use `alloc` or even drop `alloc` +too altogether if necessary. + +``` +$ cargo add regex-automata --features std,dfa-search +``` + +Now we can generate a DFA with `regex-cli`. 
This will create three files: the +little endian binary serialization of the DFA, the big endian version and a +simple Rust source file for lazily deserializing the DFA via a static into a +`regex_automata::util::lazy::Lazy`: + +``` +regex-cli generate serialize sparse dfa \ + --minimize \ + --shrink \ + --start-kind anchored \ + --rustfmt \ + --safe \ + SIMPLE_WORD_FWD \ + ./src/ \ + "\w" +``` + +We pass a number of flags here. There are even more available, and generally +speaking, there is at least one flag for each configuration knob available in +the library. This means that it should be possible to configure the DFA in any +way you might expect to be able to in the code. We can briefly explain the +flags we use here though: + +* `--minimize` applies a [DFA minimization] algorithm to try and shrink +the size of the DFA as much as possible. In some cases it can make a big +difference, but not all. Minimization can also be extremely expensive, but +given that this is an offline process and presumably done rarely, it's usually +a good trade off to make. +* `--shrink` uses heuristics to make the size of the NFA smaller in some cases. +This doesn't impact the size of the DFA, but it can make determinization (the +process of converting an NFA into a DFA) faster at the cost of making NFA +construction slower. This can make overall DFA generation time faster. +* `--start-kind anchored` says to build a DFA that only supports anchored +searches. (That is, every match must have a start offset equivalent to the +start of the search.) Without this, DFAs support both anchored and unanchored +searches, and that in turn can make them much bigger than they need to be if +you only need one or the other. +* `--rustfmt` will run `rustfmt` on the generated Rust code. +* `--safe` will use only safe code for deserializing the DFA. This may be +slower, but it is a one time cost. 
If you find that deserializing the DFA is +too slow, then dropping this option will use alternative APIs that may result +in undefined behavior if the given DFA is not valid. (Every DFA generated by +`regex-cli` is intended to be valid. So *not* using `--safe` should always be +correct, but it's up to you whether it's worth doing.) + +[DFA minimization]: https://en.wikipedia.org/wiki/DFA_minimization + +The final three positional arguments are as follows: + +* `SIMPLE_WORD_FWD` is the name of the variable in the Rust source code for +the DFA, and it is also used in generating the names of the files produced by +this command. +* `./src/` is the directory to write the files. +* `\w` is the regex pattern to build the DFA for. More than one may be given! + +Once the DFA is generated, you should see three new files in `./src/`: + +``` +$ ls -l src/ +total 32 +-rw-rw-r-- 1 andrew users 45 May 28 22:04 main.rs +-rw-rw-r-- 1 andrew users 11095 May 30 10:24 simple_word_fwd.bigendian.dfa +-rw-rw-r-- 1 andrew users 11095 May 30 10:24 simple_word_fwd.littleendian.dfa +-rw-rw-r-- 1 andrew users 711 May 30 10:24 simple_word_fwd.rs +``` + +At this point, you just need to add the appropriate `mod` definition in +`main.rs` and use the DFA: + +```rust +use regex_automata::{dfa::Automaton, Anchored, Input}; + +use crate::simple_word_fwd::SIMPLE_WORD_FWD as DFA; + +mod simple_word_fwd; + +fn main() { + let input = Input::new("ω").anchored(Anchored::Yes); + println!("is a word: {:?}", DFA.try_search_fwd(&input)); + + let input = Input::new("☃").anchored(Anchored::Yes); + println!("not a word: {:?}", DFA.try_search_fwd(&input)); +} +``` + +And now run the program: + +``` +$ cargo run + Compiling regex-dfa-test v0.1.0 (/home/andrew/tmp/regex-dfa-test) + Finished dev [unoptimized + debuginfo] target(s) in 0.17s + Running `target/debug/regex-dfa-test` +is a word: Ok(Some(HalfMatch { pattern: PatternID(0), offset: 2 })) +not a word: Ok(None) +``` + +There are a few other things worth 
mentioning: + +* The above generates a "sparse" DFA. This sacrifices search performance in +favor of (potentially much) smaller DFAs. One can also generate a "dense" DFA +to get faster searches but larger DFAs. +* Above, we generated a "dfa," but one can also generate a "regex." The +difference is that a DFA can only find the end of a match (or start of a match +if the DFA is reversed), whereas a regex will generate two DFAs: one for +finding the end of a match and then another for finding the start. One can +generate two DFAs manually and stitch them together in the code, but generating +a `regex` will take care of this for you. diff --git a/regex-cli/args/meta.rs b/regex-cli/args/meta.rs index f5503dfab8..e5e7873f0f 100644 --- a/regex-cli/args/meta.rs +++ b/regex-cli/args/meta.rs @@ -13,6 +13,7 @@ use crate::args::{self, flags, Configurable, Usage}; #[derive(Debug, Default)] pub struct Config { meta: meta::Config, + build_from_patterns: bool, } impl Config { @@ -21,11 +22,28 @@ impl Config { Ok(self.meta.clone()) } - /// Build a lazy DFA from the NFA given. + /// Whether to build a meta regex directly from the pattern strings, or to + /// require the caller to build their own HIR first. /// - /// Building a lazy DFA is generally cheap. It only does a little bit of - /// work, but otherwise, the actual determinization process is carried out - /// on demand at search time. + /// i.e., Whether the caller should use `from_patterns` or `from_hirs`. + pub fn build_from_patterns(&self) -> bool { + self.build_from_patterns + } + + /// Build a meta regex from the pattern strings given. + pub fn from_patterns<P: AsRef<str>>( + &self, + syntax: &crate::args::syntax::Config, + patterns: &[P], + ) -> anyhow::Result<meta::Regex> { + meta::Builder::new() + .configure(self.meta()?) + .syntax(syntax.syntax()?) + .build_many(patterns) + .context("failed to compile meta regex") + } + + /// Build a meta regex from the HIRs given. 
pub fn from_hirs>( &self, hirs: &[H], @@ -44,6 +62,9 @@ impl Configurable for Config { arg: &mut Arg, ) -> anyhow::Result { match *arg { + Arg::Long("build-from-patterns") => { + self.build_from_patterns = true; + } Arg::Short('k') | Arg::Long("match-kind") => { let kind: flags::MatchKind = args::parse(p, "-k/--match-kind")?; @@ -52,7 +73,7 @@ impl Configurable for Config { Arg::Short('B') | Arg::Long("no-utf8-nfa") => { self.meta = self.meta.clone().utf8_empty(false); } - Arg::Long("--no-auto-prefilter") => { + Arg::Long("no-auto-prefilter") => { self.meta = self.meta.clone().auto_prefilter(false); } Arg::Long("nfa-size-limit") => { @@ -102,6 +123,27 @@ impl Configurable for Config { fn usage(&self) -> &[Usage] { const USAGES: &'static [Usage] = &[ + Usage::new( + "--build-from-patterns", + "Build a meta regex directly from pattern strings.", + r#" +Build a meta regex directly from pattern strings. + +By default, a meta regex is built in this tool by first explicitly parsing the +patterns into ASTs, then translating them into HIRs and finally providing the +HIRs to the meta regex builder. This flag changes the behavior to pass the +pattern strings directly to the meta regex builder such that the builder is +responsible for parsing and translating. + +The main reason to use this is if you specifically want to test the meta regex +builder from patterns directly, as it may contain optimizations for skipping +aspects of parsing. + +The default behavior splits these steps out in order to time them so that +one gets a good idea of where most time is being spent during meta regex +construction. 
+"#, + ), flags::MatchKind::USAGE, Usage::new( "-B, --no-utf8-nfa", diff --git a/regex-cli/cmd/debug/dfa.rs b/regex-cli/cmd/debug/dfa.rs index 6366aec214..9381cdadc8 100644 --- a/regex-cli/cmd/debug/dfa.rs +++ b/regex-cli/cmd/debug/dfa.rs @@ -85,7 +85,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:?}", dfa)?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:?}", dfa)?; } Ok(()) } @@ -155,7 +158,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:?}", re)?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:?}", re)?; } Ok(()) } @@ -236,7 +242,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:?}", dfa)?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:?}", dfa)?; } Ok(()) } @@ -308,7 +317,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:?}", re)?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:?}", re)?; } Ok(()) } diff --git a/regex-cli/cmd/debug/literal.rs b/regex-cli/cmd/debug/literal.rs index 0c0f21aa07..715e1bbb9c 100644 --- a/regex-cli/cmd/debug/literal.rs +++ b/regex-cli/cmd/debug/literal.rs @@ -91,7 +91,9 @@ OPTIONS: } if !common.quiet { let mut out = stdout(); - writeln!(out, "")?; + if common.table() { + writeln!(out, "")?; + } match seq.literals() { None => writeln!(out, "{:?}", seq)?, Some(literals) => { diff --git a/regex-cli/cmd/debug/mod.rs b/regex-cli/cmd/debug/mod.rs index 27a0d6dbe7..525c4d1383 100644 --- a/regex-cli/cmd/debug/mod.rs +++ b/regex-cli/cmd/debug/mod.rs @@ -76,7 +76,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:#?}", &asts[0])?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:#?}", &asts[0])?; } Ok(()) } @@ -117,7 +120,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:#?}", 
&hirs[0])?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:#?}", &hirs[0])?; } Ok(()) } @@ -172,7 +178,10 @@ OPTIONS: table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:?}", dfa)?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:?}", dfa)?; } Ok(()) } @@ -222,12 +231,14 @@ OPTIONS: ); table.add("lookset any", nfa.look_set_any()); table.add("lookset prefix any", nfa.look_set_prefix_any()); - table.add("lookset prefix all", nfa.look_set_prefix_all()); if common.table() { table.print(stdout())?; } if !common.quiet { - writeln!(stdout(), "\n{:?}", nfa)?; + if common.table() { + writeln!(stdout(), "")?; + } + writeln!(stdout(), "{:?}", nfa)?; } Ok(()) } diff --git a/regex-cli/cmd/find/capture/mod.rs b/regex-cli/cmd/find/capture/mod.rs index cb3574a166..3af46bfcff 100644 --- a/regex-cli/cmd/find/capture/mod.rs +++ b/regex-cli/cmd/find/capture/mod.rs @@ -187,12 +187,20 @@ OPTIONS: let pats = patterns.get()?; let mut table = Table::empty(); - let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; - table.add("parse time", time); - let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; - table.add("translate time", time); - let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; - table.add("build meta time", time); + + let re = if meta.build_from_patterns() { + let (re, time) = util::timeitr(|| meta.from_patterns(&syntax, &pats))?; + table.add("build meta time", time); + re + } else { + let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; + table.add("parse time", time); + let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; + table.add("translate time", time); + let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; + table.add("build meta time", time); + re + }; let search = |input: &Input<'_>, caps: &mut Captures| { Ok(re.search_captures(input, caps)) diff --git a/regex-cli/cmd/find/half/mod.rs b/regex-cli/cmd/find/half/mod.rs index 6c386b6535..a7239f234b 
100644 --- a/regex-cli/cmd/find/half/mod.rs +++ b/regex-cli/cmd/find/half/mod.rs @@ -180,12 +180,20 @@ OPTIONS: let pats = patterns.get()?; let mut table = Table::empty(); - let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; - table.add("parse time", time); - let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; - table.add("translate time", time); - let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; - table.add("build meta time", time); + + let re = if meta.build_from_patterns() { + let (re, time) = util::timeitr(|| meta.from_patterns(&syntax, &pats))?; + table.add("build meta time", time); + re + } else { + let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; + table.add("parse time", time); + let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; + table.add("translate time", time); + let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; + table.add("build meta time", time); + re + }; let search = |input: &Input<'_>| Ok(re.search_half(input)); if find.count { diff --git a/regex-cli/cmd/find/match/mod.rs b/regex-cli/cmd/find/match/mod.rs index 9272513b49..0bafd8b2b4 100644 --- a/regex-cli/cmd/find/match/mod.rs +++ b/regex-cli/cmd/find/match/mod.rs @@ -142,12 +142,20 @@ OPTIONS: let pats = patterns.get()?; let mut table = Table::empty(); - let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; - table.add("parse time", time); - let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; - table.add("translate time", time); - let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; - table.add("build meta time", time); + + let re = if meta.build_from_patterns() { + let (re, time) = util::timeitr(|| meta.from_patterns(&syntax, &pats))?; + table.add("build meta time", time); + re + } else { + let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; + table.add("parse time", time); + let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; + table.add("translate time", time); + let (re, time) = 
util::timeitr(|| meta.from_hirs(&hirs))?; + table.add("build meta time", time); + re + }; let search = |input: &Input<'_>| Ok(re.search(input)); if find.count { diff --git a/regex-cli/cmd/find/which/mod.rs b/regex-cli/cmd/find/which/mod.rs index 599b54903c..4416a76f89 100644 --- a/regex-cli/cmd/find/which/mod.rs +++ b/regex-cli/cmd/find/which/mod.rs @@ -171,12 +171,20 @@ OPTIONS: let pats = patterns.get()?; let mut table = Table::empty(); - let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; - table.add("parse time", time); - let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; - table.add("translate time", time); - let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; - table.add("build meta time", time); + + let re = if meta.build_from_patterns() { + let (re, time) = util::timeitr(|| meta.from_patterns(&syntax, &pats))?; + table.add("build meta time", time); + re + } else { + let (asts, time) = util::timeitr(|| syntax.asts(&pats))?; + table.add("parse time", time); + let (hirs, time) = util::timeitr(|| syntax.hirs(&pats, &asts))?; + table.add("translate time", time); + let (re, time) = util::timeitr(|| meta.from_hirs(&hirs))?; + table.add("build meta time", time); + re + }; let search = |input: &Input<'_>, patset: &mut PatternSet| { Ok(re.which_overlapping_matches(input, patset)) diff --git a/regex-cli/cmd/generate/serialize/dfa.rs b/regex-cli/cmd/generate/serialize/dfa.rs index abcbe6ec8c..1c7b409b3a 100644 --- a/regex-cli/cmd/generate/serialize/dfa.rs +++ b/regex-cli/cmd/generate/serialize/dfa.rs @@ -830,6 +830,13 @@ lazy_static::lazy_static! 
{{ let version = env!("CARGO_PKG_VERSION"); let cmd = std::env::args_os() .map(|a| a.to_string_lossy().into_owned()) + .map(|a| { + if a.contains('\n') { + "".to_string() + } else { + a + } + }) .collect::>() .join(" "); format!( diff --git a/regex-lite/Cargo.toml b/regex-lite/Cargo.toml index 6724f39e92..1dc144b316 100644 --- a/regex-lite/Cargo.toml +++ b/regex-lite/Cargo.toml @@ -15,9 +15,13 @@ autotests = false # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features +# +# (Currently there are no supported features. 'std' is technically one, but it +# is currently required.) [features] -default = ["std"] +default = ["std", "string"] std = [] +string = [] [dev-dependencies] anyhow = "1.0.69" @@ -30,7 +34,3 @@ name = "integration" [package.metadata.docs.rs] # We want to document all features. all-features = true -# To test this locally, run: -# -# RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features -rustdoc-args = ["--cfg", "docsrs"] diff --git a/regex-lite/README.md b/regex-lite/README.md index 00d7bdd40d..34c749b216 100644 --- a/regex-lite/README.md +++ b/regex-lite/README.md @@ -1 +1,129 @@ -WIP +regex-lite +========== +This crate provides a **lightweight** regex engine for searching strings. The +regex syntax supported by this crate is nearly identical to what is found in +the `regex` crate. Like the `regex` crate, all regex searches in this crate +have worst case `O(m * n)` time complexity, where `m` is proportional to the +size of the regex and `n` is proportional to the size of the string being +searched. 
+ +[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) +[![Crates.io](https://img.shields.io/crates/v/regex-lite.svg)](https://crates.io/crates/regex-lite) + + +### Documentation + +https://docs.rs/regex-lite + + +### Usage + +To bring this crate into your repository, either add `regex-lite` to your +`Cargo.toml`, or run `cargo add regex-lite`. + +Here's a simple example that matches a date in YYYY-MM-DD format and prints the +year, month and day: + +```rust +use regex_lite::Regex; + +fn main() { + let re = Regex::new(r"(?x) +(?P<year>\d{4}) # the year +- +(?P<month>\d{2}) # the month +- +(?P<day>\d{2}) # the day +").unwrap(); + let caps = re.captures("2010-03-14").unwrap(); + + assert_eq!("2010", &caps["year"]); + assert_eq!("03", &caps["month"]); + assert_eq!("14", &caps["day"]); +} +``` + +If you have lots of dates in text that you'd like to iterate over, then it's +easy to adapt the above example with an iterator: + +```rust +use regex_lite::Regex; + +const TO_SEARCH: &'static str = " +On 2010-03-14, foo happened. On 2014-10-14, bar happened. +"; + +fn main() { + let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); + + for caps in re.captures_iter(TO_SEARCH) { + // Note that all of the unwraps are actually OK for this regex + // because the only way for the regex to match is if all of the + // capture groups match. This is not true in general though! + println!("year: {}, month: {}, day: {}", + caps.get(1).unwrap().as_str(), + caps.get(2).unwrap().as_str(), + caps.get(3).unwrap().as_str()); + } +} +``` + +This example outputs: + +```text +year: 2010, month: 03, day: 14 +year: 2014, month: 10, day: 14 +``` + + +### Minimum Rust version policy + +This crate's minimum supported `rustc` version is `1.60.0`. + +The policy is that the minimum Rust version required to use this crate can be +increased in semver compatible updates. 
+ + +### Motivation + +The primary purpose of this crate is to provide an alternative regex engine +for folks that are unhappy with the binary size and compilation time of the +primary `regex` crate. The `regex-lite` crate does the absolute minimum possible +to act as a drop-in replacement to the `regex` crate's `Regex` type. It avoids +a lot of complexity by choosing not to optimize searches and to opt out of +functionality such as robust Unicode support. By keeping the code simpler +and smaller, we get binary sizes and compile times that are substantially +better than even the `regex` crate with all of its features disabled. + +To make the benefits a bit more concrete, here are the results of one +experiment I did. For `regex`, I disabled all features except for `std`: + +* `regex 1.7.3`: 1.41s compile time, 373KB relative size increase +* `regex 1.8.1`: 1.46s compile time, 410KB relative size increase +* `regex 1.9.0`: 1.93s compile time, 565KB relative size increase +* `regex-lite 0.1.0`: 0.73s compile time, 94KB relative size increase + +The main reason why `regex-lite` does so much better than `regex` when all of +`regex`'s features are disabled is because of irreducible complexity. There are +certain parts of the code in `regex` that can't be arbitrarily divided based +on binary size and compile time goals. It's instead more sustainable to just +maintain an entirely separate crate. + +Ideas for improving the binary size and compile times of this crate even more +are most welcome. + + +### License + +This project is licensed under either of + + * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or + https://www.apache.org/licenses/LICENSE-2.0) + * MIT license ([LICENSE-MIT](LICENSE-MIT) or + https://opensource.org/licenses/MIT) + +at your option. + +The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode +License Agreement +([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)). 
diff --git a/regex-lite/src/error.rs b/regex-lite/src/error.rs index a6313aa8a6..c56c20c61a 100644 --- a/regex-lite/src/error.rs +++ b/regex-lite/src/error.rs @@ -1,3 +1,12 @@ +/// An error that occurred during parsing or compiling a regular expression. +/// +/// A parse error occurs when the syntax of the regex pattern is not +/// valid. Otherwise, a regex can still fail to build if it would +/// result in a machine that exceeds the configured size limit, via +/// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit). +/// +/// This error type provides no introspection capabilities. The only thing you +/// can do with it is convert it to a string as a human readable error message. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Error { msg: &'static str, diff --git a/regex-lite/src/hir/mod.rs b/regex-lite/src/hir/mod.rs index dba3ea8c06..f73a5420ab 100644 --- a/regex-lite/src/hir/mod.rs +++ b/regex-lite/src/hir/mod.rs @@ -4,7 +4,7 @@ use crate::{error::Error, utf8}; mod parse; -/// Escapes all regular expression meta characters in `haystack`. +/// Escapes all regular expression meta characters in `pattern`. /// /// The string returned may be safely used as a literal in a regular /// expression. @@ -155,6 +155,7 @@ pub(crate) struct Hir { kind: HirKind, is_start_anchored: bool, is_match_empty: bool, + static_explicit_captures_len: Option, } #[derive(Clone, Debug, Eq, PartialEq)] @@ -199,24 +200,51 @@ impl Hir { self.is_match_empty } + /// If the pattern always reports the same number of matching capture groups + /// for every match, then this returns the number of those groups. This + /// doesn't include the implicit group found in every pattern. 
+ pub(crate) fn static_explicit_captures_len(&self) -> Option { + self.static_explicit_captures_len + } + fn fail() -> Hir { let kind = HirKind::Class(Class { ranges: vec![] }); - Hir { kind, is_start_anchored: false, is_match_empty: false } + Hir { + kind, + is_start_anchored: false, + is_match_empty: false, + static_explicit_captures_len: Some(0), + } } fn empty() -> Hir { let kind = HirKind::Empty; - Hir { kind, is_start_anchored: false, is_match_empty: true } + Hir { + kind, + is_start_anchored: false, + is_match_empty: true, + static_explicit_captures_len: Some(0), + } } fn char(ch: char) -> Hir { let kind = HirKind::Char(ch); - Hir { kind, is_start_anchored: false, is_match_empty: false } + Hir { + kind, + is_start_anchored: false, + is_match_empty: false, + static_explicit_captures_len: Some(0), + } } fn class(class: Class) -> Hir { let kind = HirKind::Class(class); - Hir { kind, is_start_anchored: false, is_match_empty: false } + Hir { + kind, + is_start_anchored: false, + is_match_empty: false, + static_explicit_captures_len: Some(0), + } } fn look(look: Look) -> Hir { @@ -225,6 +253,7 @@ impl Hir { kind, is_start_anchored: matches!(look, Look::Start), is_match_empty: true, + static_explicit_captures_len: Some(0), } } @@ -236,15 +265,47 @@ impl Hir { } let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored; let is_match_empty = rep.min == 0 || rep.sub.is_match_empty; - let kind = HirKind::Repetition(rep); - Hir { kind, is_start_anchored, is_match_empty } + let mut static_explicit_captures_len = + rep.sub.static_explicit_captures_len; + // If the static captures len of the sub-expression is not known or + // is greater than zero, then it automatically propagates to the + // repetition, regardless of the repetition. Otherwise, it might + // change, but only when the repetition can match 0 times. 
+ if rep.min == 0 + && static_explicit_captures_len.map_or(false, |len| len > 0) + { + // If we require a match 0 times, then our captures len is + // guaranteed to be zero. Otherwise, if we *can* match the empty + // string, then it's impossible to know how many captures will be + // in the resulting match. + if rep.max == Some(0) { + static_explicit_captures_len = Some(0); + } else { + static_explicit_captures_len = None; + } + } + Hir { + kind: HirKind::Repetition(rep), + is_start_anchored, + is_match_empty, + static_explicit_captures_len, + } } fn capture(cap: Capture) -> Hir { let is_start_anchored = cap.sub.is_start_anchored; let is_match_empty = cap.sub.is_match_empty; + let static_explicit_captures_len = cap + .sub + .static_explicit_captures_len + .map(|len| len.saturating_add(1)); let kind = HirKind::Capture(cap); - Hir { kind, is_start_anchored, is_match_empty } + Hir { + kind, + is_start_anchored, + is_match_empty, + static_explicit_captures_len, + } } fn concat(mut subs: Vec) -> Hir { @@ -254,9 +315,22 @@ impl Hir { subs.pop().unwrap() } else { let is_start_anchored = subs[0].is_start_anchored; - let is_match_empty = subs.iter().all(|s| s.is_match_empty); - let kind = HirKind::Concat(subs); - Hir { kind, is_start_anchored, is_match_empty } + let mut is_match_empty = true; + let mut static_explicit_captures_len = Some(0usize); + for sub in subs.iter() { + is_match_empty = is_match_empty && sub.is_match_empty; + static_explicit_captures_len = static_explicit_captures_len + .and_then(|len1| { + Some((len1, sub.static_explicit_captures_len?)) + }) + .and_then(|(len1, len2)| Some(len1.saturating_add(len2))); + } + Hir { + kind: HirKind::Concat(subs), + is_start_anchored, + is_match_empty, + static_explicit_captures_len, + } } } @@ -266,10 +340,28 @@ impl Hir { } else if subs.len() == 1 { subs.pop().unwrap() } else { - let is_start_anchored = subs.iter().all(|s| s.is_start_anchored); - let is_match_empty = subs.iter().any(|s| s.is_match_empty); - let kind = 
HirKind::Alternation(subs); - Hir { kind, is_start_anchored, is_match_empty } + let mut it = subs.iter().peekable(); + let mut is_start_anchored = + it.peek().map_or(false, |sub| sub.is_start_anchored); + let mut is_match_empty = + it.peek().map_or(false, |sub| sub.is_match_empty); + let mut static_explicit_captures_len = + it.peek().and_then(|sub| sub.static_explicit_captures_len); + for sub in it { + is_start_anchored = is_start_anchored && sub.is_start_anchored; + is_match_empty = is_match_empty || sub.is_match_empty; + if static_explicit_captures_len + != sub.static_explicit_captures_len + { + static_explicit_captures_len = None; + } + } + Hir { + kind: HirKind::Alternation(subs), + is_start_anchored, + is_match_empty, + static_explicit_captures_len, + } } } } diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs index c4f7eb09f4..0b406d1d89 100644 --- a/regex-lite/src/hir/parse.rs +++ b/regex-lite/src/hir/parse.rs @@ -182,12 +182,12 @@ impl<'a> Parser<'a> { /// This returns the old depth. fn increment_depth(&self) -> Result { let old = self.depth.get(); + if old > self.config.nest_limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } // OK because our depth starts at 0, and we return an error if it // ever reaches the limit. So the call depth can never exceed u32::MAX. let new = old.checked_add(1).unwrap(); - if new >= self.config.nest_limit { - return Err(Error::new(ERR_TOO_MUCH_NESTING)); - } self.depth.set(new); Ok(old) } @@ -1896,7 +1896,7 @@ bar fn err_standard() { assert_eq!( ERR_TOO_MUCH_NESTING, - perr("(((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))"), + perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"), ); // This one is tricky, because the only way it can happen is if the // number of captures overflows u32. 
Perhaps we should allow setting a
diff --git a/regex-lite/src/interpolate.rs b/regex-lite/src/interpolate.rs
index 3be9d10606..b01d25bdc9 100644
--- a/regex-lite/src/interpolate.rs
+++ b/regex-lite/src/interpolate.rs
@@ -234,8 +234,8 @@ fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
     let cap = core::str::from_utf8(&rep[i..cap_end])
         .expect("valid UTF-8 capture name");
     Some(CaptureRef {
-        cap: match cap.parse::<u32>() {
-            Ok(i) => Ref::Number(i as usize),
+        cap: match cap.parse::<usize>() {
+            Ok(i) => Ref::Number(i),
             Err(_) => Ref::Named(cap),
         },
         end: cap_end,
@@ -263,8 +263,8 @@ fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
         Ok(cap) => cap,
     };
     Some(CaptureRef {
-        cap: match cap.parse::<u32>() {
-            Ok(i) => Ref::Number(i as usize),
+        cap: match cap.parse::<usize>() {
+            Ok(i) => Ref::Number(i),
             Err(_) => Ref::Named(cap),
         },
         end: i + 1,
diff --git a/regex-lite/src/lib.rs b/regex-lite/src/lib.rs
index e8b03d95ca..d8e9016788 100644
--- a/regex-lite/src/lib.rs
+++ b/regex-lite/src/lib.rs
@@ -1,5 +1,840 @@
 /*!
-TODO
+This crate provides a **lightweight** regex engine for searching strings. The
+regex syntax supported by this crate is nearly identical to what is found in
+the [`regex`](https://docs.rs/regex) crate. Like the `regex` crate, all regex
+searches in this crate have worst case `O(m * n)` time complexity, where `m` is
+proportional to the size of the regex and `n` is proportional to the size of
+the string being searched.
+
+The principal difference between the `regex` and `regex-lite` crates is that
+the latter prioritizes smaller binary sizes and shorter Rust compile times
+over performance and functionality. As a result, regex searches in this crate
+are typically substantially slower than what is provided by the `regex` crate.
+Moreover, this crate only has the most basic level of Unicode support: it
+matches codepoint by codepoint but otherwise doesn't support Unicode case
+insensitivity or things like `\p{Letter}`.
In exchange, this crate contributes
+far less to binary size and compiles much more quickly.
+
+If you just want API documentation, then skip to the [`Regex`] type. Otherwise,
+here's a quick example showing one way of parsing the output of a grep-like
+program:
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"(?m)^([^:]+):([0-9]+):(.+)$").unwrap();
+let hay = "\
+path/to/foo:54:Blue Harvest
+path/to/bar:90:Something, Something, Something, Dark Side
+path/to/baz:3:It's a Trap!
+";
+
+let mut results = vec![];
+for (_, [path, lineno, line]) in re.captures_iter(hay).map(|c| c.extract()) {
+    results.push((path, lineno.parse::<u64>()?, line));
+}
+assert_eq!(results, vec![
+    ("path/to/foo", 54, "Blue Harvest"),
+    ("path/to/bar", 90, "Something, Something, Something, Dark Side"),
+    ("path/to/baz", 3, "It's a Trap!"),
+]);
+# Ok::<(), Box<dyn std::error::Error>>(())
+```
+
+# Overview
+
+The primary type in this crate is a [`Regex`]. Its most important methods are
+as follows:
+
+* [`Regex::new`] compiles a regex using the default configuration. A
+[`RegexBuilder`] permits setting a non-default configuration. (For example,
+case insensitive matching, verbose mode and others.)
+* [`Regex::is_match`] reports whether a match exists in a particular haystack.
+* [`Regex::find`] reports the byte offsets of a match in a haystack, if one
+exists. [`Regex::find_iter`] returns an iterator over all such matches.
+* [`Regex::captures`] returns a [`Captures`], which reports both the byte
+offsets of a match in a haystack and the byte offsets of each matching capture
+group from the regex in the haystack.
+[`Regex::captures_iter`] returns an iterator over all such matches.
+
+Otherwise, this top-level crate documentation is organized as follows:
+
+* [Usage](#usage) shows how to add the `regex-lite` crate to your Rust project.
+* [Examples](#examples) provides a limited selection of regex search examples.
+* [Differences with the regex crate](#differences-with-the-regex-crate)
+provides a precise description of how `regex-lite` differs from `regex`.
+* [Syntax](#syntax) enumerates the specific regex syntax supported by this
+crate.
+* [Untrusted input](#untrusted-input) discusses how this crate deals with regex
+patterns or haystacks that are untrusted.
+
+# Usage
+
+The `regex-lite` crate is [on crates.io](https://crates.io/crates/regex-lite)
+and can be used by adding `regex-lite` to your dependencies in your project's
+`Cargo.toml`. Or more simply, just run `cargo add regex-lite`.
+
+Here is a complete example that creates a new Rust project, adds a dependency
+on `regex-lite`, creates the source code for a regex search and then runs the
+program.
+
+First, create the project in a new directory:
+
+```text
+$ mkdir regex-example
+$ cd regex-example
+$ cargo init
+```
+
+Second, add a dependency on `regex-lite`:
+
+```text
+$ cargo add regex-lite
+```
+
+Third, edit `src/main.rs`. Delete what's there and replace it with this:
+
+```
+use regex_lite::Regex;
+
+fn main() {
+    let re = Regex::new(r"Hello (?<name>\w+)!").unwrap();
+    let Some(caps) = re.captures("Hello Murphy!") else {
+        println!("no match!");
+        return;
+    };
+    println!("The name is: {}", &caps["name"]);
+}
+```
+
+Fourth, run it with `cargo run`:
+
+```text
+$ cargo run
+   Compiling regex-lite v0.1.0
+   Compiling regex-example v0.1.0 (/tmp/regex-example)
+    Finished dev [unoptimized + debuginfo] target(s) in 4.22s
+     Running `target/debug/regex-example`
+The name is: Murphy
+```
+
+The first time you run the program will show more output like above. But
+subsequent runs shouldn't have to re-compile the dependencies.
+
+# Examples
+
+This section provides a few examples, in tutorial style, showing how to
+search a haystack with a regex. There are more examples throughout the API
+documentation.
+
+Before starting though, it's worth defining a few terms:
+
+* A **regex** is a Rust value whose type is `Regex`.
We use `re` as a +variable name for a regex. +* A **pattern** is the string that is used to build a regex. We use `pat` as +a variable name for a pattern. +* A **haystack** is the string that is searched by a regex. We use `hay` as a +variable name for a haystack. + +Sometimes the words "regex" and "pattern" are used interchangeably. + +General use of regular expressions in this crate proceeds by compiling a +**pattern** into a **regex**, and then using that regex to search, split or +replace parts of a **haystack**. + +### Example: find a middle initial + +We'll start off with a very simple example: a regex that looks for a specific +name but uses a wildcard to match a middle initial. Our pattern serves as +something like a template that will match a particular name with *any* middle +initial. + +```rust +use regex_lite::Regex; + +// We use 'unwrap()' here because it would be a bug in our program if the +// pattern failed to compile to a regex. Panicking in the presence of a bug +// is okay. +let re = Regex::new(r"Homer (.)\. Simpson").unwrap(); +let hay = "Homer J. Simpson"; +let Some(caps) = re.captures(hay) else { return }; +assert_eq!("J", &caps[1]); +``` + +There are a few things worth noticing here in our first example: + +* The `.` is a special pattern meta character that means "match any single +character except for new lines." (More precisely, in this crate, it means +"match any UTF-8 encoding of any Unicode scalar value other than `\n`.") +* We can match an actual `.` literally by escaping it, i.e., `\.`. +* We use Rust's [raw strings] to avoid needing to deal with escape sequences in +both the regex pattern syntax and in Rust's string literal syntax. If we didn't +use raw strings here, we would have had to use `\\.` to match a literal `.` +character. That is, `r"\."` and `"\\."` are equivalent patterns. +* We put our wildcard `.` instruction in parentheses. 
These parentheses have a
+special meaning that says, "make whatever part of the haystack matches within
+these parentheses available as a capturing group." After finding a match, we
+access this capture group with `&caps[1]`.
+
+[raw strings]: https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals
+
+Otherwise, we execute a search using `re.captures(hay)` and return from our
+function if no match occurred. We then reference the middle initial by asking
+for the part of the haystack that matched the capture group indexed at `1`.
+(The capture group at index 0 is implicit and always corresponds to the entire
+match. In this case, that's `Homer J. Simpson`.)
+
+### Example: named capture groups
+
+Continuing from our middle initial example above, we can tweak the pattern
+slightly to give a name to the group that matches the middle initial:
+
+```rust
+use regex_lite::Regex;
+
+// Note that (?P<middle>.) is a different way to spell the same thing.
+let re = Regex::new(r"Homer (?<middle>.)\. Simpson").unwrap();
+let hay = "Homer J. Simpson";
+let Some(caps) = re.captures(hay) else { return };
+assert_eq!("J", &caps["middle"]);
+```
+
+Giving a name to a group can be useful when there are multiple groups in
+a pattern. It makes the code referring to those groups a bit easier to
+understand.
+
+### Example: validating a particular date format
+
+This example shows how to confirm whether a haystack, in its entirety, matches
+a particular date format:
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
+assert!(re.is_match("2010-03-14"));
+```
+
+Notice the use of the `^` and `$` anchors. In this crate, every regex search is
+run with an implicit `(?s:.)*?` at the beginning of its pattern, which allows
+the regex to match anywhere in a haystack. Anchors, as above, can be used to
+ensure that the full haystack matches a pattern.
+
+### Example: finding dates in a haystack
+
+In the previous example, we showed how one might validate that a haystack,
+in its entirety, corresponded to a particular date format. But what if we wanted
+to extract all things that look like dates in a specific format from a haystack?
+To do this, we can use an iterator API to find all matches (notice that we've
+removed the anchors):
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"\d{4}-\d{2}-\d{2}").unwrap();
+let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
+// 'm' is a 'Match', and 'as_str()' returns the matching part of the haystack.
+let dates: Vec<&str> = re.find_iter(hay).map(|m| m.as_str()).collect();
+assert_eq!(dates, vec![
+    "1865-04-14",
+    "1881-07-02",
+    "1901-09-06",
+    "1963-11-22",
+]);
+```
+
+We can also iterate over [`Captures`] values instead of [`Match`] values, and
+that in turn permits accessing each component of the date via capturing groups:
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"(?<y>\d{4})-(?<m>\d{2})-(?<d>\d{2})").unwrap();
+let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
+// 'm' is a 'Match', and 'as_str()' returns the matching part of the haystack.
+let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| {
+    // The unwraps are okay because every capture group must match if the whole
+    // regex matches, and in this context, we know we have a match.
+    //
+    // Note that we use `caps.name("y").unwrap().as_str()` instead of
+    // `&caps["y"]` because the lifetime of the former is the same as the
+    // lifetime of `hay` above, but the lifetime of the latter is tied to the
+    // lifetime of `caps` due to how the `Index` trait is defined.
+ let year = caps.name("y").unwrap().as_str(); + let month = caps.name("m").unwrap().as_str(); + let day = caps.name("d").unwrap().as_str(); + (year, month, day) +}).collect(); +assert_eq!(dates, vec![ + ("1865", "04", "14"), + ("1881", "07", "02"), + ("1901", "09", "06"), + ("1963", "11", "22"), +]); +``` + +### Example: simpler capture group extraction + +One can use [`Captures::extract`] to make the code from the previous example a +bit simpler in this case: + +```rust +use regex_lite::Regex; + +let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); +let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?"; +let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| { + let (_, [year, month, day]) = caps.extract(); + (year, month, day) +}).collect(); +assert_eq!(dates, vec![ + ("1865", "04", "14"), + ("1881", "07", "02"), + ("1901", "09", "06"), + ("1963", "11", "22"), +]); +``` + +`Captures::extract` works by ensuring that the number of matching groups match +the number of groups requested via the `[year, month, day]` syntax. If they do, +then the substrings for each corresponding capture group are automatically +returned in an appropriately sized array. Rust's syntax for pattern matching +arrays does the rest. + +### Example: replacement with named capture groups + +Building on the previous example, perhaps we'd like to rearrange the date +formats. This can be done by finding each match and replacing it with +something different. 
The [`Regex::replace_all`] routine provides a convenient
+way to do this, including by supporting references to named groups in the
+replacement string:
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"(?<y>\d{4})-(?<m>\d{2})-(?<d>\d{2})").unwrap();
+let before = "1973-01-05, 1975-08-25 and 1980-10-18";
+let after = re.replace_all(before, "$m/$d/$y");
+assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980");
+```
+
+The replace methods are actually polymorphic in the replacement, which
+provides more flexibility than is seen here. (See the documentation for
+[`Regex::replace`] for more details.)
+
+### Example: verbose mode
+
+When your regex gets complicated, you might consider using something other
+than regex. But if you stick with regex, you can use the `x` flag to enable
+insignificant whitespace mode or "verbose mode." In this mode, whitespace
+is treated as insignificant and one may write comments. This may make your
+patterns easier to comprehend.
+
+```rust
+use regex_lite::Regex;
+
+let re = Regex::new(r"(?x)
+  (?P<y>\d{4}) # the year
+  -
+  (?P<m>\d{2}) # the month
+  -
+  (?P<d>\d{2}) # the day
+").unwrap();
+
+let before = "1973-01-05, 1975-08-25 and 1980-10-18";
+let after = re.replace_all(before, "$m/$d/$y");
+assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980");
+```
+
+If you wish to match against whitespace in this mode, you can still use `\s`,
+`\n`, `\t`, etc. For escaping a single space character, you can escape it
+directly with `\ `, use its hex character code `\x20` or temporarily disable
+the `x` flag, e.g., `(?-x: )`.
+
+# Differences with the `regex` crate
+
+As mentioned in the introduction above, the purpose of this crate is to
+prioritize small binary sizes and shorter Rust compilation times as much as
+possible.
Namely, while the `regex` crate tends to eschew both binary size +and compilation time in favor of faster searches and features, the +`regex-lite` crate gives up faster searches and some functionality in exchange +for smaller binary sizes and faster compilation times. + +The precise set of differences at the syntax level: + +* The Perl character classes are limited to ASCII codepoints. That is, +`\d` is `[0-9]`, `\s` is `[\t\n\v\f\r ]` and `\w` is `[0-9A-Za-z_]`. +* Unicode character classes of the form `\p{...}` and `\P{...}` are not +supported at all. Note though that things like `[^β]` are still supported and +will match any Unicode scalar value except for `β`. +* Case insensitive searching is limited to ASCII case insensitivity. +* Character class set operations other than union are not supported. That is, +difference (`--`), intersection (`&&`) and symmetric difference (`~~`) are +not available. These tend to be most useful with Unicode classes, which also +aren't available. +* Opt-in octal support is not available in this crate. + +And now at the API level: + +* Currently, this crate only supports searching `&str`. It does not have APIs +for searching `&[u8]` haystacks, although it is planned to add these in the +future if there's demand. +* There is no `RegexSet` in this crate and there are no plans to add it. +* The `Error` type in this crate is completely opaque. + +Other than these things, the `regex-lite` crate is intended to be a drop-in +replacement for the `regex` crate. In most cases, you can just replace `use +regex::Regex;` with `use regex_lite::Regex;` and everything will work. (Unless +you're depending on Unicode support in your regexes.) + +# Syntax + +The syntax supported in this crate is documented below. + +### Matching one character + +

+.             any character except new line (includes new line with s flag)
+[0-9]         any ASCII digit
+\d            digit ([0-9])
+\D            not digit
+
+ +### Character classes + +
+[xyz]         A character class matching either x, y or z (union).
+[^xyz]        A character class matching any character except x, y and z.
+[a-z]         A character class matching any character in range a-z.
+[[:alpha:]]   ASCII character class ([A-Za-z])
+[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
+[\[\]]        Escaping in character classes (matching [ or ])
+
+ +Any ASCII or Perl character class may appear inside a bracketed `[...]` character +class. For example, `[\s[:digit:]]` matches any digit or space character. + +Precedence in character classes, from most binding to least: + +1. Ranges: `[a-cd]` == `[[a-c]d]` +2. Union: `[ab&&bc]` == `[[ab]&&[bc]]` +3. Negation: `[^a-z&&b]` == `[^[a-z&&b]]`. + +### Composites + +
+xy    concatenation (x followed by y)
+x|y   alternation (x or y, prefer x)
+
+ +This example shows how an alternation works, and what it means to prefer a +branch in the alternation over subsequent branches. + +``` +use regex_lite::Regex; + +let haystack = "samwise"; +// If 'samwise' comes first in our alternation, then it is +// preferred as a match, even if the regex engine could +// technically detect that 'sam' led to a match earlier. +let re = Regex::new(r"samwise|sam").unwrap(); +assert_eq!("samwise", re.find(haystack).unwrap().as_str()); +// But if 'sam' comes first, then it will match instead. +// In this case, it is impossible for 'samwise' to match +// because 'sam' is a prefix of it. +let re = Regex::new(r"sam|samwise").unwrap(); +assert_eq!("sam", re.find(haystack).unwrap().as_str()); +``` + +### Repetitions + +
+x*        zero or more of x (greedy)
+x+        one or more of x (greedy)
+x?        zero or one of x (greedy)
+x*?       zero or more of x (ungreedy/lazy)
+x+?       one or more of x (ungreedy/lazy)
+x??       zero or one of x (ungreedy/lazy)
+x{n,m}    at least n x and at most m x (greedy)
+x{n,}     at least n x (greedy)
+x{n}      exactly n x
+x{n,m}?   at least n x and at most m x (ungreedy/lazy)
+x{n,}?    at least n x (ungreedy/lazy)
+x{n}?     exactly n x
+
+ +### Empty matches + +
+^     the beginning of a haystack (or start-of-line with multi-line mode)
+$     the end of a haystack (or end-of-line with multi-line mode)
+\A    only the beginning of a haystack (even with multi-line mode enabled)
+\z    only the end of a haystack (even with multi-line mode enabled)
+\b    an ASCII word boundary (\w on one side and \W, \A, or \z on other)
+\B    not an ASCII word boundary
+
+ +The empty regex is valid and matches the empty string. For example, the +empty regex matches `abc` at positions `0`, `1`, `2` and `3`. When using the +top-level [`Regex`] on `&str` haystacks, an empty match that splits a codepoint +is guaranteed to never be returned. For example: + +```rust +let re = regex_lite::Regex::new(r"").unwrap(); +let ranges: Vec<_> = re.find_iter("💩").map(|m| m.range()).collect(); +assert_eq!(ranges, vec![0..0, 4..4]); +``` + +Note that an empty regex is distinct from a regex that can never match. For +example, the regex `[^\s\S]` is a character class that represents the negation +of `[\s\S]`, where the union of `\s` and `\S` corresponds to all Unicode scalar +values. The negation of everything is nothing, which means the character class +is empty. Since nothing is in the empty set, `[^\s\S]` matches nothing, not +even the empty string. + +### Grouping and flags + +
+(exp)          numbered capture group (indexed by opening parenthesis)
+(?P<name>exp)  named (also numbered) capture group (names must be alpha-numeric)
+(?<name>exp)   named (also numbered) capture group (names must be alpha-numeric)
+(?:exp)        non-capturing group
+(?flags)       set flags within current group
+(?flags:exp)   set flags for exp (non-capturing)
+
+ +Capture group names must be any sequence of alpha-numeric Unicode codepoints, +in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or +an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic` +Unicode property, while numeric codepoints correspond to the union of the +`Decimal_Number`, `Letter_Number` and `Other_Number` general categories. + +Flags are each a single character. For example, `(?x)` sets the flag `x` +and `(?-x)` clears the flag `x`. Multiple flags can be set or cleared at +the same time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets +the `x` flag and clears the `y` flag. + +All flags are by default disabled unless stated otherwise. They are: + +
+i     case-insensitive: letters match both upper and lower case
+m     multi-line mode: ^ and $ match begin/end of line
+s     allow . to match \n
+R     enables CRLF mode: when multi-line mode is enabled, \r\n is used
+U     swap the meaning of x* and x*?
+x     verbose mode, ignores whitespace and allow line comments (starting with `#`)
+
+ +Note that in verbose mode, whitespace is ignored everywhere, including within +character classes. To insert whitespace, use its escaped form or a hex literal. +For example, `\ ` or `\x20` for an ASCII space. + +Flags can be toggled within a pattern. Here's an example that matches +case-insensitively for the first part but case-sensitively for the second part: + +```rust +use regex_lite::Regex; + +let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); +let m = re.find("AaAaAbbBBBb").unwrap(); +assert_eq!(m.as_str(), "AaAaAbb"); +``` + +Notice that the `a+` matches either `a` or `A`, but the `b+` only matches +`b`. + +Multi-line mode means `^` and `$` no longer match just at the beginning/end of +the input, but also at the beginning/end of lines: + +``` +use regex_lite::Regex; + +let re = Regex::new(r"(?m)^line \d+").unwrap(); +let m = re.find("line one\nline 2\n").unwrap(); +assert_eq!(m.as_str(), "line 2"); +``` + +Note that `^` matches after new lines, even at the end of input: + +``` +use regex_lite::Regex; + +let re = Regex::new(r"(?m)^").unwrap(); +let m = re.find_iter("test\n").last().unwrap(); +assert_eq!((m.start(), m.end()), (5, 5)); +``` + +When both CRLF mode and multi-line mode are enabled, then `^` and `$` will +match either `\r` and `\n`, but never in the middle of a `\r\n`: + +``` +use regex_lite::Regex; + +let re = Regex::new(r"(?mR)^foo$").unwrap(); +let m = re.find("\r\nfoo\r\n").unwrap(); +assert_eq!(m.as_str(), "foo"); +``` + +### Escape sequences + +Note that this includes all possible escape sequences, even ones that are +documented elsewhere. + +
+\*          literal *, applies to all ASCII except [0-9A-Za-z<>]
+\a          bell (\x07)
+\f          form feed (\x0C)
+\t          horizontal tab
+\n          new line
+\r          carriage return
+\v          vertical tab (\x0B)
+\A          matches at the beginning of a haystack
+\z          matches at the end of a haystack
+\b          word boundary assertion
+\B          negated word boundary assertion
+\x7F        hex character code (exactly two digits)
+\x{10FFFF}  any hex character code corresponding to a Unicode code point
+\u007F      hex character code (exactly four digits)
+\u{7F}      any hex character code corresponding to a Unicode code point
+\U0000007F  hex character code (exactly eight digits)
+\U{7F}      any hex character code corresponding to a Unicode code point
+\d, \s, \w  Perl character class
+\D, \S, \W  negated Perl character class
+
+ +### Perl character classes (ASCII only) + +These character classes are short-hands for common groups of characters. In +this crate, `\d`, `\s` and `\w` only consist of ASCII codepoints. + +
+\d     digit ([0-9])
+\D     not digit
+\s     whitespace ([\t\n\v\f\r ])
+\S     not whitespace
+\w     word character ([0-9A-Za-z_])
+\W     not word character
+
+ +### ASCII character classes + +These reflect additional groups of characters taken from POSIX regex syntax +that are sometimes useful to have. In this crate, all of these classes only +consist of ASCII codepoints. + +
+[[:alnum:]]    alphanumeric ([0-9A-Za-z])
+[[:alpha:]]    alphabetic ([A-Za-z])
+[[:ascii:]]    ASCII ([\x00-\x7F])
+[[:blank:]]    blank ([\t ])
+[[:cntrl:]]    control ([\x00-\x1F\x7F])
+[[:digit:]]    digits ([0-9])
+[[:graph:]]    graphical ([!-~])
+[[:lower:]]    lower case ([a-z])
+[[:print:]]    printable ([ -~])
+[[:punct:]]    punctuation ([!-/:-@\[-`{-~])
+[[:space:]]    whitespace ([\t\n\v\f\r ])
+[[:upper:]]    upper case ([A-Z])
+[[:word:]]     word characters ([0-9A-Za-z_])
+[[:xdigit:]]   hex digit ([0-9A-Fa-f])
+
+ +# Untrusted input + +This crate is meant to be able to run regex searches on untrusted haystacks +without fear of [ReDoS]. This crate also, to a certain extent, supports +untrusted patterns. + +[ReDoS]: https://en.wikipedia.org/wiki/ReDoS + +This crate differs from most (but not all) other regex engines in that it +doesn't use unbounded backtracking to run a regex search. In those cases, +one generally cannot use untrusted patterns *or* untrusted haystacks because +it can be very difficult to know whether a particular pattern will result in +catastrophic backtracking or not. + +We'll first discuss how this crate deals with untrusted inputs and then wrap +it up with a realistic discussion about what practice really looks like. + +### Panics + +Outside of clearly documented cases, most APIs in this crate are intended to +never panic regardless of the inputs given to them. For example, `Regex::new`, +`Regex::is_match`, `Regex::find` and `Regex::captures` should never panic. That +is, it is an API promise that those APIs will never panic no matter what inputs +are given to them. With that said, regex engines are complicated beasts, and +providing a rock solid guarantee that these APIs literally never panic is +essentially equivalent to saying, "there are no bugs in this library." That is +a bold claim, and not really one that can be feasibly made with a straight +face. + +Don't get the wrong impression here. This crate is extensively tested, not just +with unit and integration tests, but also via fuzz testing. For example, this +crate is part of the [OSS-fuzz project]. Panics should be incredibly rare, but +it is possible for bugs to exist, and thus possible for a panic to occur. If +you need a rock solid guarantee against panics, then you should wrap calls into +this library with [`std::panic::catch_unwind`]. + +It's also worth pointing out that this library will generally panic when other +regex engines would commit undefined behavior. 
When undefined behavior occurs, +your program might continue as if nothing bad has happened, but it also might +mean your program is open to the worst kinds of exploits. In contrast, the +worst thing a panic can do is a denial of service. + +[OSS-fuzz project]: https://android.googlesource.com/platform/external/oss-fuzz/+/refs/tags/android-t-preview-1/projects/rust-regex/ +[`std::panic::catch_unwind`]: https://doc.rust-lang.org/std/panic/fn.catch_unwind.html + +### Untrusted patterns + +The principal way this crate deals with them is by limiting their size by +default. The size limit can be configured via [`RegexBuilder::size_limit`]. The +idea of a size limit is that compiling a pattern into a `Regex` will fail if it +becomes "too big." Namely, while *most* resources consumed by compiling a regex +are approximately proportional to the length of the pattern itself, there is +one particular exception to this: counted repetitions. Namely, this pattern: + +```text +a{5}{5}{5}{5}{5}{5} +``` + +Is equivalent to this pattern: + +```text +a{15625} +``` + +In both of these cases, the actual pattern string is quite small, but the +resulting `Regex` value is quite large. Indeed, as the first pattern shows, +it isn't enough to locally limit the size of each repetition because they can +be stacked in a way that results in exponential growth. + +To provide a bit more context, a simplified view of regex compilation looks +like this: + +* The pattern string is parsed into a structured representation called an HIR +(high-level intermediate representation). Counted repetitions are not expanded +in this stage. That is, the size of the HIR is proportional to the size +of the pattern with "reasonable" constant factors. In other words, one can +reasonably limit the memory used by an HIR by limiting the length of the +pattern string. +* The HIR is compiled into a [Thompson NFA]. This is the stage at which +something like `\w{5}` is rewritten to `\w\w\w\w\w`. 
Thus, this is the stage +at which [`RegexBuilder::size_limit`] is enforced. If the NFA exceeds the +configured size, then this stage will fail. + +[Thompson NFA]: https://en.wikipedia.org/wiki/Thompson%27s_construction + +The size limit helps avoid two different kinds of exorbitant resource usage: + +* It avoids permitting exponential memory usage based on the size of the +pattern string. +* It avoids long search times. This will be discussed in more detail in the +next section, but worst case search time *is* dependent on the size of the +regex. So keeping regexes limited to a reasonable size is also a way of keeping +search times reasonable. + +Finally, it's worth pointing out that regex compilation is guaranteed to take +worst case `O(m)` time, where `m` is proportional to the size of regex. The +size of the regex here is *after* the counted repetitions have been expanded. + +**Advice for those using untrusted regexes**: limit the pattern length to +something small and expand it as needed. Configure [`RegexBuilder::size_limit`] +to something small and then expand it as needed. + +### Untrusted haystacks + +The main way this crate guards against searches from taking a long time is by +using algorithms that guarantee a `O(m * n)` worst case time and space bound. +Namely: + +* `m` is proportional to the size of the regex, where the size of the regex +includes the expansion of all counted repetitions. (See the previous section on +untrusted patterns.) +* `n` is proportional to the length, in bytes, of the haystack. + +In other words, if you consider `m` to be a constant (for example, the regex +pattern is a literal in the source code), then the search can be said to run +in "linear time." Or equivalently, "linear time with respect to the size of the +haystack." + +But the `m` factor here is important not to ignore. If a regex is +particularly big, the search times can get quite slow. This is why, in part, +[`RegexBuilder::size_limit`] exists. 
+ +**Advice for those searching untrusted haystacks**: As long as your regexes +are not enormous, you should expect to be able to search untrusted haystacks +without fear. If you aren't sure, you should benchmark it. Unlike backtracking +engines, if your regex is so big that it's likely to result in slow searches, +this is probably something you'll be able to observe regardless of what the +haystack is made up of. + +### Iterating over matches + +One thing that is perhaps easy to miss is that the worst case time +complexity bound of `O(m * n)` applies to methods like [`Regex::is_match`], +[`Regex::find`] and [`Regex::captures`]. It does **not** apply to +[`Regex::find_iter`] or [`Regex::captures_iter`]. Namely, since iterating over +all matches can execute many searches, and each search can scan the entire +haystack, the worst case time complexity for iterators is `O(m * n^2)`. + +One example of where this occurs is when a pattern consists of an alternation, +where an earlier branch of the alternation requires scanning the entire +haystack only to discover that there is no match. It also requires a later +branch of the alternation to have matched at the beginning of the search. For +example, consider the pattern `.*[^A-Z]|[A-Z]` and the haystack `AAAAA`. The +first search will scan to the end looking for matches of `.*[^A-Z]` even though +a finite automata engine (as in this crate) knows that `[A-Z]` has already +matched the first character of the haystack. This is due to the greedy nature +of regex searching. That first search will report a match at the first `A` only +after scanning to the end to discover that no other match exists. The next +search then begins at the second `A` and the behavior repeats. + +There is no way to avoid this. This means that if both patterns and haystacks +are untrusted and you're iterating over all matches, you're susceptible +to worst case quadratic time complexity. 
One possible way to mitigate +this is to switch to the lower level `regex-automata` crate and use its +`meta::Regex` iterator APIs. There, you can configure the search to operate +in "earliest" mode by passing a `Input::new(haystack).earliest(true)` to +`meta::Regex::find_iter` (for example). By enabling this mode, you give up +the normal greedy match semantics of regex searches and instead ask the regex +engine to immediately stop as soon as a match has been found. Enabling this +mode will thus restore the worst case `O(m * n)` time complexity bound, but at +the cost of different semantics. + +### Untrusted inputs in practice + +While providing a `O(m * n)` worst case time bound on all searches goes a long +way toward preventing [ReDoS], that doesn't mean every search you can possibly +run will complete without burning CPU time. In general, there are a few ways +for the `m * n` time bound to still bite you: + +* You are searching an exceptionally long haystack. No matter how you slice +it, a longer haystack will take more time to search. +* Very large regexes can cause searches to be quite slow due to increasing the size +`m` in the worst case `O(m * n)` bound. This is especially true when they +are combined with counted repetitions. While the regex size limit above will +protect you from the most egregious cases, the default size limit still +permits pretty big regexes that can execute more slowly than one might expect. +* While routines like [`Regex::find`] and [`Regex::captures`] guarantee +worst case `O(m * n)` search time, routines like [`Regex::find_iter`] and +[`Regex::captures_iter`] actually have worst case `O(m * n^2)` search time. +This is because `find_iter` runs many searches, and each search takes worst +case `O(m * n)` time. Thus, iteration of all matches in a haystack has +worst case `O(m * n^2)`. A good example of a pattern that exhibits this is +`(?:A+){1000}|` or even `.*[^A-Z]|[A-Z]`. 
+ +In general, untrusted haystacks are easier to stomach than untrusted patterns. +Untrusted patterns give a lot more control to the caller to impact the +performance of a search. Therefore, permitting untrusted patterns means that +your only line of defense is to put a limit on how big `m` (and perhaps also +`n`) can be in `O(m * n)`. `n` is limited by simply inspecting the length +of the haystack while `m` is limited by *both* applying a limit to the +length of the pattern *and* a limit on the compiled size of the regex via +[`RegexBuilder::size_limit`]. + +It bears repeating: if you're accepting untrusted patterns, it would be a good +idea to start with conservative limits on `m` and `n`, and then carefully +increase them as needed. */ #![no_std] @@ -9,9 +844,12 @@ TODO // we need some way to synchronize access to a PikeVM cache. That in turn will // likely require rolling our own primitive spin-lock or similar structure. #![forbid(unsafe_code)] -// #![deny(missing_docs, rustdoc::broken_intra_doc_links)] +#![deny(missing_docs, rustdoc::broken_intra_doc_links)] #![warn(missing_debug_implementations)] -#![cfg_attr(docsrs, feature(doc_auto_cfg))] +// When the main features are disabled, squash dead code warnings. The +// alternative is to litter conditional compilation directives everywhere, +// which is super annoying. 
+#![cfg_attr(not(feature = "string"), allow(dead_code))] #[cfg(not(feature = "std"))] compile_error!("'std' is currently a required feature, please file an issue"); @@ -23,7 +861,9 @@ extern crate alloc; #[cfg(any(test, feature = "std"))] extern crate std; -pub use self::{error::Error, hir::escape, string::*}; +#[cfg(feature = "string")] +pub use self::string::*; +pub use self::{error::Error, hir::escape}; mod error; mod hir; @@ -32,5 +872,6 @@ mod interpolate; mod nfa; mod pikevm; mod pool; +#[cfg(feature = "string")] mod string; mod utf8; diff --git a/regex-lite/src/nfa.rs b/regex-lite/src/nfa.rs index 5802c48683..12404dab61 100644 --- a/regex-lite/src/nfa.rs +++ b/regex-lite/src/nfa.rs @@ -35,6 +35,9 @@ pub(crate) struct NFA { is_start_anchored: bool, /// Whether this NFA can match the empty string. is_match_empty: bool, + /// If every match has the same number of matching capture groups, then + /// this corresponds to the number of groups. + static_explicit_captures_len: Option, /// A map from capture group name to its corresponding index. cap_name_to_index: CaptureNameMap, /// A map from capture group index to the corresponding name, if one @@ -87,6 +90,14 @@ impl NFA { self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) } + /* + /// Returns the capture group name for the corresponding index. + /// If no such group with the given index, then `None` is returned. + pub(crate) fn to_name(&self, index: usize) -> Option<&str> { + self.cap_index_to_name.get(index)?.as_deref() + } + */ + /// Returns an iterator over all of the capture groups, along with their /// names if they exist, in this NFA. pub(crate) fn capture_names(&self) -> CaptureNames<'_> { @@ -105,6 +116,13 @@ impl NFA { self.is_start_anchored } + /// If the pattern always reports the same number of matching capture groups + /// for every match, then this returns the number of those groups. This + /// doesn't include the implicit group found in every pattern. 
+ pub(crate) fn static_explicit_captures_len(&self) -> Option { + self.static_explicit_captures_len + } + /// Returns the heap memory usage, in bytes, used by this NFA. fn memory_usage(&self) -> usize { (self.states.len() * size_of::()) @@ -252,6 +270,7 @@ impl Compiler { start: 0, is_start_anchored: false, is_match_empty: false, + static_explicit_captures_len: None, cap_name_to_index: CaptureNameMap::default(), cap_index_to_name: vec![], memory_extra: 0, @@ -262,6 +281,8 @@ impl Compiler { fn compile(self, hir: &Hir) -> Result { self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); + self.nfa.borrow_mut().static_explicit_captures_len = + hir.static_explicit_captures_len(); let compiled = self.c_capture(0, None, hir)?; let mat = self.add(State::Match)?; self.patch(compiled.end, mat)?; diff --git a/regex-lite/src/string.rs b/regex-lite/src/string.rs index 6eab40afe9..358f7a0c75 100644 --- a/regex-lite/src/string.rs +++ b/regex-lite/src/string.rs @@ -13,25 +13,224 @@ use crate::{ pool::CachePool, }; -#[derive(Debug)] +/// A compiled regular expression for searching Unicode haystacks. +/// +/// A `Regex` can be used to search haystacks, split haystacks into substrings +/// or replace substrings in a haystack with a different substring. All +/// searching is done with an implicit `(?s:.)*?` at the beginning and end of +/// an pattern. To force an expression to match the whole string (or a prefix +/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// While this crate will handle Unicode strings (whether in the regular +/// expression or in the haystack), all positions returned are **byte +/// offsets**. Every byte offset is guaranteed to be at a Unicode code point +/// boundary. That is, all offsets returned by the `Regex` API are guaranteed +/// to be ranges that can slice a `&str` without panicking. 
+/// +/// The only methods that allocate new strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// references into the haystack given. +/// +/// # Example +/// +/// Find the offsets of a US phone number: +/// +/// ``` +/// use regex_lite::Regex; +/// +/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let m = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!(7..19, m.range()); +/// ``` +/// +/// # Example: extracting capture groups +/// +/// A common way to use regexes is with capture groups. That is, instead of +/// just looking for matches of an entire regex, parentheses are used to create +/// groups that represent part of the match. +/// +/// For example, consider a haystack with multiple lines, and each line has +/// three whitespace delimited fields where the second field is expected to be +/// a number and the third field a boolean. To make this convenient, we use +/// the [`Captures::extract`] API to put the strings that match each group +/// into a fixed size array: +/// +/// ``` +/// use regex_lite::Regex; +/// +/// let hay = " +/// rabbit 54 true +/// groundhog 2 true +/// does not match +/// fox 109 false +/// "; +/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap(); +/// let mut fields: Vec<(&str, i64, bool)> = vec![]; +/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) { +/// fields.push((f1, f2.parse()?, f3.parse()?)); +/// } +/// assert_eq!(fields, vec![ +/// ("rabbit", 54, true), +/// ("groundhog", 2, true), +/// ("fox", 109, false), +/// ]); +/// +/// # Ok::<(), Box>(()) +/// ``` pub struct Regex { pikevm: Arc, pool: CachePool, } +impl Clone for Regex { + fn clone(&self) -> Regex { + let pikevm = Arc::clone(&self.pikevm); + let pool = { + let pikevm = Arc::clone(&self.pikevm); + let create = Box::new(move || Cache::new(&pikevm)); + CachePool::new(create) + }; + Regex { pikevm, pool } + } +} + +impl 
core::fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl core::fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_tuple("Regex").field(&self.as_str()).finish() + } +} + +impl core::str::FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result { + Regex::new(s) + } +} + +/// Core regular expression methods. impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace substrings in a haystack. + /// + /// Note that regex compilation tends to be a somewhat expensive process, + /// and unlike higher level environments, compilation is not automatically + /// cached for you. One should endeavor to compile a regex once and then + /// reuse it. For example, it's a bad idea to compile the same regex + /// repeatedly in a loop. + /// + /// # Errors + /// + /// If an invalid pattern is given, then an error is returned. + /// An error is also returned if the pattern is valid, but would + /// produce a regex that is bigger than the configured size limit via + /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by + /// default.) + /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// // An Invalid pattern because of an unclosed parenthesis + /// assert!(Regex::new(r"foo(bar").is_err()); + /// // An invalid pattern because the regex would be too big + /// // because Unicode tends to inflate things. + /// assert!(Regex::new(r"\w{1000000}").is_err()); + /// ``` pub fn new(pattern: &str) -> Result { RegexBuilder::new(pattern).build() } + /// Returns true if and only if there is a match for the regex anywhere + /// in the haystack given. 
+ /// + /// It is recommended to use this method if all you need to do is test + /// whether a match exists, since the underlying matching engine may be + /// able to do less work. + /// + /// # Example + /// + /// Test if some haystack contains at least one word with exactly 13 + /// word characters: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = "I categorically deny having triskaidekaphobia."; + /// assert!(re.is_match(hay)); + /// ``` + #[inline] pub fn is_match(&self, haystack: &str) -> bool { self.is_match_at(haystack, 0) } + /// This routine searches for the first match of this regex in the + /// haystack given, and if found, returns a [`Match`]. The `Match` + /// provides access to both the byte offsets of the match and the actual + /// substring that matched. + /// + /// Note that this should only be used if you want to find the entire + /// match. If instead you just want to test the existence of a match, + /// it's potentially faster to use `Regex::is_match(hay)` instead of + /// `Regex::find(hay).is_some()`. + /// + /// # Example + /// + /// Find the first word with exactly 13 word characters: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = "I categorically deny having triskaidekaphobia."; + /// let mat = re.find(hay).unwrap(); + /// assert_eq!(2..15, mat.range()); + /// assert_eq!("categorically", mat.as_str()); + /// ``` + #[inline] pub fn find<'h>(&self, haystack: &'h str) -> Option> { self.find_at(haystack, 0) } + /// Returns an iterator that yields successive non-overlapping matches in + /// the given haystack. The iterator yields values of type [`Match`]. 
+ /// + /// # Time complexity + /// + /// Note that since `find_iter` runs potentially many searches on the + /// haystack and since each search has worst case `O(m * n)` time + /// complexity, the overall worst case time complexity for iteration is + /// `O(m * n^2)`. + /// + /// # Example + /// + /// Find every word with exactly 13 word characters: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = "Retroactively relinquishing remunerations is reprehensible."; + /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_str()).collect(); + /// assert_eq!(matches, vec![ + /// "Retroactively", + /// "relinquishing", + /// "remunerations", + /// "reprehensible", + /// ]); + /// ``` + #[inline] pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> { Matches { haystack, @@ -39,10 +238,158 @@ impl Regex { } } + /// This routine searches for the first match of this regex in the haystack + /// given, and if found, returns not only the overall match but also the + /// matches of each capture group in the regex. If no match is found, then + /// `None` is returned. + /// + /// Capture group `0` always corresponds to an implicit unnamed group that + /// includes the entire match. If a match is found, this group is always + /// present. Subsequent groups may be named and are numbered, starting + /// at 1, by the order in which the opening parenthesis appears in the + /// pattern. For example, in the pattern `(?.(?.))(?.)`, `a`, + /// `b` and `c` correspond to capture group indices `1`, `2` and `3`, + /// respectively. + /// + /// You should only use `captures` if you need access to the capture group + /// matches. Otherwise, [`Regex::find`] is generally faster for discovering + /// just the overall match. + /// + /// # Example + /// + /// Say you have some haystack with movie names and their release years, + /// like "'Citizen Kane' (1941)". 
It'd be nice if we could search for + /// substrings looking like that, while also extracting the movie name and + /// its release year separately. The example below shows how to do that. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(hay).unwrap(); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. In this case, these + /// // accesses are always correct because the overall regex will only + /// // match when these capture groups match. + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// assert_eq!(&caps[1], "Citizen Kane"); + /// assert_eq!(&caps[2], "1941"); + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"'(?[^']+)'\s+\((?<year>\d{4})\)").unwrap(); + /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(hay).unwrap(); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. In this case, + /// // these accesses are always correct because the overall regex will + /// // only match when these capture groups match. 
+ /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + /// + /// Finally, one other way to get the matched substrings is with the + /// [`Captures::extract`] API: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let (full, [title, year]) = re.captures(hay).unwrap().extract(); + /// assert_eq!(full, "'Citizen Kane' (1941)"); + /// assert_eq!(title, "Citizen Kane"); + /// assert_eq!(year, "1941"); + /// ``` + #[inline] pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> { self.captures_at(haystack, 0) } + /// Returns an iterator that yields successive non-overlapping matches in + /// the given haystack. The iterator yields values of type [`Captures`]. + /// + /// This is the same as [`Regex::find_iter`], but instead of only providing + /// access to the overall match, each value yielded includes access to the + /// matches of all capture groups in the regex. Reporting this extra match + /// data is potentially costly, so callers should only use `captures_iter` + /// over `find_iter` when they actually need access to the capture group + /// matches. + /// + /// # Time complexity + /// + /// Note that since `captures_iter` runs potentially many searches on the + /// haystack and since each search has worst case `O(m * n)` time + /// complexity, the overall worst case time complexity for iteration is + /// `O(m * n^2)`. 
+ /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some haystack, where the movie is formatted like "'Title' (xxxx)": + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap(); + /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// let mut movies = vec![]; + /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) { + /// movies.push((title, year.parse::<i64>()?)); + /// } + /// assert_eq!(movies, vec![ + /// ("Citizen Kane", 1941), + /// ("The Wizard of Oz", 1939), + /// ("M", 1931), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Or with named groups: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap(); + /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// let mut it = re.captures_iter(hay); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], "The Wizard of Oz"); + /// assert_eq!(&caps["year"], "1939"); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], "M"); + /// assert_eq!(&caps["year"], "1931"); + /// ``` + #[inline] pub fn captures_iter<'r, 'h>( &'r self, haystack: &'h str, @@ -56,10 +403,207 @@ impl Regex { } } + /// Returns an iterator of substrings of the haystack given, delimited by a + /// match of the regex. Namely, each element of the iterator corresponds to + /// a part of the haystack that *isn't* matched by the regular expression. 
+ /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let hay = "a b \t c\td e"; + /// let fields: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); + /// ``` + /// + /// # Example: more cases + /// + /// Basic usage: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = "Mary had a little lamb"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = ""; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec![""]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "lionXXtigerXleopard"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); + /// + /// let re = Regex::new(r"::").unwrap(); + /// let hay = "lion::tiger::leopard"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); + /// ``` + /// + /// If a haystack contains multiple contiguous matches, you will end up + /// with empty spans yielded by the iterator: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "XXXXaXXbXc"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); + /// + /// let re = Regex::new(r"/").unwrap(); + /// let hay = "(///)"; + /// let got: Vec<&str> = 
re.split(hay).collect(); + /// assert_eq!(got, vec!["(", "", "", ")"]); + /// ``` + /// + /// Separators at the start or end of a haystack are neighbored by empty + /// substrings. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"0").unwrap(); + /// let hay = "010"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "1", ""]); + /// ``` + /// + /// When the empty string is used as a regex, it splits at every valid + /// UTF-8 boundary by default (which includes the beginning and end of the + /// haystack): + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let hay = "rust"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); + /// + /// // Splitting by an empty string is UTF-8 aware by default! + /// let re = Regex::new(r"").unwrap(); + /// let hay = "☃"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "☃", ""]); + /// ``` + /// + /// Contiguous separators (which commonly show up with whitespace) can lead to + /// possibly surprising behavior. For example, this code is correct: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = " a b c"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); + /// ``` + /// + /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want + /// to match contiguous space characters: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r" +").unwrap(); + /// let hay = " a b c"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// // N.B. This does still include a leading empty span because ' +' + /// // matches at the beginning of the haystack. 
+ /// assert_eq!(got, vec!["", "a", "b", "c"]); + /// ``` + #[inline] pub fn split<'r, 'h>(&'r self, haystack: &'h str) -> Split<'r, 'h> { Split { haystack, finder: self.find_iter(haystack), last: 0 } } + /// Returns an iterator of at most `limit` substrings of the haystack + /// given, delimited by a match of the regex. (A `limit` of `0` will return + /// no substrings.) Namely, each element of the iterator corresponds to a + /// part of the haystack that *isn't* matched by the regular expression. + /// The remainder of the haystack that is not split will be the last + /// element in the iterator. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// Although note that the worst case time here has an upper bound given + /// by the `limit` parameter. + /// + /// # Example + /// + /// Get the first two words in some haystack: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\W+").unwrap(); + /// let hay = "Hey! 
How are you?"; + /// let fields: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); + /// ``` + /// + /// # Examples: more cases + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = "Mary had a little lamb"; + /// let got: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = ""; + /// let got: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec![""]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "lionXXtigerXleopard"; + /// let got: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); + /// + /// let re = Regex::new(r"::").unwrap(); + /// let hay = "lion::tiger::leopard"; + /// let got: Vec<&str> = re.splitn(hay, 2).collect(); + /// assert_eq!(got, vec!["lion", "tiger::leopard"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "abcXdef"; + /// let got: Vec<&str> = re.splitn(hay, 1).collect(); + /// assert_eq!(got, vec!["abcXdef"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "abcdef"; + /// let got: Vec<&str> = re.splitn(hay, 2).collect(); + /// assert_eq!(got, vec!["abcdef"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "abcXdef"; + /// let got: Vec<&str> = re.splitn(hay, 0).collect(); + /// assert!(got.is_empty()); + /// ``` + #[inline] pub fn splitn<'r, 'h>( &'r self, haystack: &'h str, @@ -68,6 +612,109 @@ impl Regex { SplitN { splits: self.split(haystack), limit } } + /// Replaces the leftmost-first match in the given haystack with the + /// replacement provided. The replacement can be a regular string (where + /// `$N` and `$name` are expanded to match capture groups) or a function + /// that takes a [`Captures`] and returns the replaced string. 
+ /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// # Replacement string syntax + /// + /// All instances of `$ref` in the replacement string are replaced with + /// the substring corresponding to the capture group identified by `ref`. + /// + /// `ref` may be an integer corresponding to the index of the capture group + /// (counted by order of opening parenthesis where `0` is the entire match) + /// or it can be a name (consisting of letters, digits or underscores) + /// corresponding to a named capture group. + /// + /// If `ref` isn't a valid capture group (whether the name doesn't exist or + /// isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. For example, `$1a` looks up the + /// capture group named `1a` and not the capture group at index `1`. To + /// exert more precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Example + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal string: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"[^01]+").unwrap(); + /// assert_eq!(re.replace("1078910", ""), "1010"); + /// ``` + /// + /// But anything satisfying the [`Replacer`] trait will work. For example, + /// a closure of type `|&Captures| -> String` provides direct access to the + /// captures corresponding to a match. 
This allows one to access capturing + /// group matches easily: + /// + /// ``` + /// use regex_lite::{Captures, Regex}; + /// + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { + /// format!("{} {}", &caps[2], &caps[1]) + /// }); + /// assert_eq!(result, "Bruce Springsteen"); + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported (as described above) that expands `$name` into the + /// corresponding capture group. Here's the last example, but using this + /// expansion technique with named capture groups: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", "$first $last"); + /// assert_eq!(result, "Bruce Springsteen"); + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement when it is adjacent to some other + /// literal text. For example, if we wanted to join two words together with + /// an underscore: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap(); + /// let result = re.replace("deep fried", "${first}_$second"); + /// assert_eq!(result, "deep_fried"); + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. 
This can be done by wrapping a + /// string with [`NoExpand`]: + /// + /// ``` + /// use regex_lite::{NoExpand, Regex}; + /// + /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); + /// assert_eq!(result, "$2 $last"); + /// ``` + /// + /// Using `NoExpand` may also be faster, since the replacement string won't + /// need to be parsed for the `$` syntax. + #[inline] pub fn replace<'h, R: Replacer>( &self, haystack: &'h str, @@ -76,6 +723,90 @@ impl Regex { self.replacen(haystack, 1, rep) } + /// Replaces all non-overlapping matches in the haystack with the + /// replacement provided. This is the same as calling `replacen` with + /// `limit` set to `0`. + /// + /// The documentation for [`Regex::replace`] goes into more detail about + /// what kinds of replacement strings are supported. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// # Fallibility + /// + /// If you need to write a replacement routine where any individual + /// replacement might "fail," doing so with this API isn't really feasible + /// because there's no way to stop the search process if a replacement + /// fails. 
Instead, if you need this functionality, you should consider + /// implementing your own replacement routine: + /// + /// ``` + /// use regex_lite::{Captures, Regex}; + /// + /// fn replace_all<E>( + /// re: &Regex, + /// haystack: &str, + /// replacement: impl Fn(&Captures) -> Result<String, E>, + /// ) -> Result<String, E> { + /// let mut new = String::with_capacity(haystack.len()); + /// let mut last_match = 0; + /// for caps in re.captures_iter(haystack) { + /// let m = caps.get(0).unwrap(); + /// new.push_str(&haystack[last_match..m.start()]); + /// new.push_str(&replacement(&caps)?); + /// last_match = m.end(); + /// } + /// new.push_str(&haystack[last_match..]); + /// Ok(new) + /// } + /// + /// // Let's replace each word with the number of bytes in that word. + /// // But if we see a word that is "too long," we'll give up. + /// let re = Regex::new(r"\w+").unwrap(); + /// let replacement = |caps: &Captures| -> Result<String, &'static str> { + /// if caps[0].len() >= 5 { + /// return Err("word too long"); + /// } + /// Ok(caps[0].len().to_string()) + /// }; + /// assert_eq!( + /// Ok("2 3 3 3?".to_string()), + /// replace_all(&re, "hi how are you?", &replacement), + /// ); + /// assert!(replace_all(&re, "hi there", &replacement).is_err()); + /// ``` + /// + /// # Example + /// + /// This example shows how to flip the order of whitespace delimited + /// fields, and normalizes the whitespace that delimits the fields: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?m)^(\S+)\s+(\S+)$").unwrap(); + /// let hay = " + /// Greetings 1973 + /// Wild\t1973 + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "; + /// let new = re.replace_all(hay, "$2 $1"); + /// assert_eq!(new, " + /// 1973 Greetings + /// 1973 Wild + /// 1975 BornToRun + /// 1978 Darkness + /// 1980 TheRiver + /// "); + /// ``` + #[inline] pub fn replace_all<'h, R: Replacer>( &self, haystack: &'h str, @@ -84,6 +815,56 @@ impl Regex { 
self.replacen(haystack, 0, rep) } + /// Replaces at most `limit` non-overlapping matches in the haystack with + /// the replacement provided. If `limit` is `0`, then all non-overlapping + /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is + /// equivalent to `Regex::replacen(hay, 0, rep)`. + /// + /// The documentation for [`Regex::replace`] goes into more detail about + /// what kinds of replacement strings are supported. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// Although note that the worst case time here has an upper bound given + /// by the `limit` parameter. + /// + /// # Fallibility + /// + /// See the corresponding section in the docs for [`Regex::replace_all`] + /// for tips on how to deal with a replacement routine that can fail. + /// + /// # Example + /// + /// This example shows how to flip the order of whitespace delimited + /// fields, and normalizes the whitespace that delimits the fields. But we + /// only do it for the first two matches. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?m)^(\S+)\s+(\S+)$").unwrap(); + /// let hay = " + /// Greetings 1973 + /// Wild\t1973 + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "; + /// let new = re.replacen(hay, 2, "$2 $1"); + /// assert_eq!(new, " + /// 1973 Greetings + /// 1973 Wild + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "); + /// ``` + #[inline] pub fn replacen<'h, R: Replacer>( &self, haystack: &'h str, @@ -141,45 +922,75 @@ impl Regex { } } -/// Advanced or "lower level" search methods. +/// A group of advanced or "lower level" search methods. Some methods permit +/// starting the search at a position greater than `0` in the haystack. 
Other +/// methods permit reusing allocations, for example, when extracting the +/// matches for capture groups. impl Regex { - /// Returns the end location of a match in the haystack given. + /// Returns the end byte offset of the first match in the haystack given. /// /// This method may have the same performance characteristics as - /// `is_match`, except it provides an end location for a match. In - /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match that you would find via `Regex::find`. + /// `is_match`. Behaviorally, it doesn't just report whether a match + /// occurs, but also the end offset for a match. In particular, the offset + /// returned *may be shorter* than the proper end of the leftmost-first + /// match that you would find via [`Regex::find`]. /// /// Note that it is not guaranteed that this routine finds the shortest or /// "earliest" possible match. Instead, the main idea of this API is that /// it returns the offset at the point at which the internal regex engine /// has determined that a match has occurred. This may vary depending on /// which internal regex engine is used, and thus, the offset itself may - /// change. + /// change based on internal heuristics. /// /// # Example /// /// Typically, `a+` would match the entire first sequence of `a` in some - /// haystack, but `shortest_match` can give up as soon as it sees the first - /// `a`. + /// haystack, but `shortest_match` *may* give up as soon as it sees the + /// first `a`. 
/// /// ``` - /// /// use regex_lite::Regex; - /// let haystack = "aaaaa"; - /// let pos = Regex::new(r"a+").unwrap().shortest_match(haystack); - /// assert_eq!(pos, Some(1)); + /// + /// let re = Regex::new(r"a+").unwrap(); + /// let offset = re.shortest_match("aaaaa").unwrap(); + /// assert_eq!(offset, 1); /// ``` + #[inline] pub fn shortest_match(&self, haystack: &str) -> Option<usize> { self.shortest_match_at(haystack, 0) } - /// Returns the same as `shortest_match`, but starts the search at the - /// given offset. + /// Returns the same as [`Regex::shortest_match`], but starts the search at + /// the given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only match /// when `start == 0`. + /// + /// If a match is found, the offset returned is relative to the beginning + /// of the haystack, not the beginning of the search. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(re.shortest_match(&hay[2..]), Some(4)); + /// // No match because the assertions take the context into account. + /// assert_eq!(re.shortest_match_at(hay, 2), None); + /// ``` + #[inline] pub fn shortest_match_at( &self, haystack: &str, @@ -201,12 +1012,34 @@ impl Regex { Some(slots[1].unwrap().get()) } - /// Returns the same as is_match, but starts the search at the given - /// offset. + /// Returns the same as [`Regex::is_match`], but starts the search at the + /// given offset. 
/// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert!(re.is_match(&hay[2..])); + /// // No match because the assertions take the context into account. + /// assert!(!re.is_match_at(hay, 2)); + /// ``` + #[inline] pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { let mut cache = self.pool.get(); self.pikevm.search( @@ -219,12 +1052,34 @@ impl Regex { ) } - /// Returns the same as find, but starts the search at the given + /// Returns the same as [`Regex::find`], but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4)); + /// // No match because the assertions take the context into account. 
+ /// assert_eq!(re.find_at(hay, 2), None); + /// ``` + #[inline] pub fn find_at<'h>( &self, haystack: &'h str, @@ -253,6 +1108,27 @@ impl Regex { /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], "chew"); + /// // No match because the assertions take the context into account. + /// assert!(re.captures_at(hay, 2).is_none()); + /// ``` #[inline] pub fn captures_at<'h>( &self, @@ -279,16 +1155,39 @@ impl Regex { Some(caps) } - /// This is like `captures`, but uses - /// [`CaptureLocations`](struct.CaptureLocations.html) - /// instead of - /// [`Captures`](struct.Captures.html) in order to amortize allocations. + /// This is like [`Regex::captures`], but writes the byte offsets of each + /// capture group match into the locations given. + /// + /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], + /// but does *not* store a reference to the haystack. This makes its API + /// a bit lower level and less convenient. But in exchange, callers + /// may allocate their own `CaptureLocations` and reuse it for multiple + /// searches. This may be helpful if allocating a `Captures` shows up in a + /// profile as too costly. /// /// To create a `CaptureLocations` value, use the - /// `Regex::capture_locations` method. + /// [`Regex::capture_locations`] method. 
+ /// + /// This also returns the overall match if one was found. When a match is found, + /// its offsets are also always stored in `locs` at index `0`. + /// + /// # Panics + /// + /// This routine may panic if the given `CaptureLocations` was not created + /// by this regex. /// - /// This returns the overall match if this was successful, which is always - /// equivalence to the `0`th capture group. + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert!(re.captures_read(&mut locs, "id=foo123").is_some()); + /// assert_eq!(Some((0, 9)), locs.get(0)); + /// assert_eq!(Some((0, 2)), locs.get(1)); + /// assert_eq!(Some((3, 9)), locs.get(2)); + /// ``` #[inline] pub fn captures_read<'h>( &self, @@ -298,12 +1197,37 @@ impl Regex { self.captures_read_at(locs, haystack, 0) } - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. + /// Returns the same as [`Regex::captures_read`], but starts the search at + /// the given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// This routine may also panic if the given `CaptureLocations` was not + /// created by this regex. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// let mut locs = re.capture_locations(); + /// // We get a match here, but it's probably not intended. 
+ /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some()); + /// // No match because the assertions take the context into account. + /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none()); + /// ``` #[inline] pub fn captures_read_at<'h>( &self, @@ -326,77 +1250,230 @@ impl Regex { let (start, end) = locs.get(0).unwrap(); Some(Match::new(haystack, start, end)) } - - /// An undocumented alias for `captures_read_at`. - /// - /// The `regex-capi` crate previously used this routine, so to avoid - /// breaking that crate, we continue to provide the name as an undocumented - /// alias. - #[doc(hidden)] - #[inline] - pub fn read_captures_at<'h>( - &self, - locs: &mut CaptureLocations, - haystack: &'h str, - start: usize, - ) -> Option<Match<'h>> { - self.captures_read_at(locs, haystack, start) - } } /// Auxiliary methods. impl Regex { /// Returns the original string of this regex. + /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"foo\w+bar").unwrap(); + /// assert_eq!(re.as_str(), r"foo\w+bar"); + /// ``` + #[inline] + pub fn as_str(&self) -> &str { + &self.pikevm.nfa().pattern() + } + + /// Returns an iterator over the capture names in this regex. + /// + /// The iterator returned yields elements of type `Option<&str>`. That is, + /// the iterator yields values for all capture groups, even ones that are + /// unnamed. The order of the groups corresponds to the order of the group's + /// corresponding opening parenthesis. + /// + /// The first element of the iterator always yields the group corresponding + /// to the overall match, and this group is always unnamed. Therefore, the + /// iterator always yields at least one group. 
+ /// + /// # Example + /// + /// This shows basic usage with a mix of named and unnamed capture groups: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); + /// let mut names = re.capture_names(); + /// assert_eq!(names.next(), Some(None)); + /// assert_eq!(names.next(), Some(Some("a"))); + /// assert_eq!(names.next(), Some(Some("b"))); + /// assert_eq!(names.next(), Some(None)); + /// // the '(?:.)' group is non-capturing and so doesn't appear here! + /// assert_eq!(names.next(), Some(Some("c"))); + /// assert_eq!(names.next(), None); + /// ``` + /// + /// The iterator always yields at least one element, even for regexes with + /// no capture groups and even for regexes that can never match: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let mut names = re.capture_names();  + /// assert_eq!(names.next(), Some(None)); + /// assert_eq!(names.next(), None); + /// + /// let re = Regex::new(r"[^\s\S]").unwrap(); + /// let mut names = re.capture_names(); + /// assert_eq!(names.next(), Some(None)); + /// assert_eq!(names.next(), None); + /// ``` + #[inline] + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.pikevm.nfa().capture_names()) + } + + /// Returns the number of capture groups in this regex. + /// + /// This includes all named and unnamed groups, including the implicit + /// unnamed group that is always present and corresponds to the entire + /// match. + /// + /// Since the implicit unnamed group is always included in this length, the + /// length returned is guaranteed to be greater than zero. 
+ /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"foo").unwrap(); + /// assert_eq!(1, re.captures_len()); + /// + /// let re = Regex::new(r"(foo)").unwrap(); + /// assert_eq!(2, re.captures_len()); + /// + /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); + /// assert_eq!(5, re.captures_len()); + /// + /// let re = Regex::new(r"[^\s\S]").unwrap(); + /// assert_eq!(1, re.captures_len()); + /// ``` + #[inline] + pub fn captures_len(&self) -> usize { + self.pikevm.nfa().group_len() + } + + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` #[inline] - pub fn as_str(&self) -> &str { - &self.pikevm.nfa().pattern() - } - - /// Returns an iterator over the capture names. 
- pub fn capture_names(&self) -> CaptureNames<'_> { - CaptureNames(self.pikevm.nfa().capture_names()) - } - - /// Returns the number of captures. - pub fn captures_len(&self) -> usize { - self.pikevm.nfa().group_len() + pub fn static_captures_len(&self) -> Option<usize> { + self.pikevm + .nfa() + .static_explicit_captures_len() + .map(|len| len.saturating_add(1)) } - /// Returns an empty set of capture locations that can be reused in - /// multiple calls to `captures_read` or `captures_read_at`. + /// Returns a fresh allocated set of capture locations that can + /// be reused in multiple calls to [`Regex::captures_read`] or + /// [`Regex::captures_read_at`]. + /// + /// The returned locations can be used for any subsequent search for this + /// particular regex. There is no guarantee that it is correct to use for + /// other regexes, even if they have the same number of capture groups. + /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(.)(.)(\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert!(re.captures_read(&mut locs, "Padron").is_some()); + /// assert_eq!(locs.get(0), Some((0, 6))); + /// assert_eq!(locs.get(1), Some((0, 1))); + /// assert_eq!(locs.get(2), Some((1, 2))); + /// assert_eq!(locs.get(3), Some((2, 6))); + /// ``` #[inline] pub fn capture_locations(&self) -> CaptureLocations { // OK because NFA construction would have failed if this overflowed. let len = self.pikevm.nfa().group_len().checked_mul(2).unwrap(); CaptureLocations(vec![None; len]) } - - /// An alias for `capture_locations` to preserve backward compatibility. - /// - /// The `regex-capi` crate uses this method, so to avoid breaking that - /// crate, we continue to export it as an undocumented API. 
- #[doc(hidden)] - #[inline] - pub fn locations(&self) -> CaptureLocations { - self.capture_locations() - } -} - -impl Clone for Regex { - fn clone(&self) -> Regex { - let pikevm = Arc::clone(&self.pikevm); - let pool = { - let pikevm = Arc::clone(&self.pikevm); - let create = Box::new(move || Cache::new(&pikevm)); - CachePool::new(create) - }; - Regex { pikevm, pool } - } } -/// Match represents a single match of a regex in a haystack. +/// Represents a single match of a regex in a haystack. +/// +/// A `Match` contains both the start and end byte offsets of the match and the +/// actual substring corresponding to the range of those byte offsets. It is +/// guaranteed that `start <= end`. When `start == end`, the match is empty. +/// +/// Since this `Match` can only be produced by the top-level `Regex` APIs +/// that only support searching UTF-8 encoded strings, the byte offsets for a +/// `Match` are guaranteed to fall on valid UTF-8 codepoint boundaries. That +/// is, slicing a `&str` with [`Match::range`] is guaranteed to never panic. +/// +/// Values with this type are created by [`Regex::find`] or +/// [`Regex::find_iter`]. Other APIs can create `Match` values too. For +/// example, [`Captures::get`]. +/// +/// The lifetime parameter `'h` refers to the lifetime of the matched of the +/// haystack that this match was produced from. +/// +/// # Numbering +/// +/// The byte offsets in a `Match` form a half-open interval. That is, the +/// start of the range is inclusive and the end of the range is exclusive. +/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte +/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and +/// `6` corresponds to `x`, which is one past the end of the match. This +/// corresponds to the same kind of slicing that Rust uses. 
+/// +/// For more on why this was chosen over other schemes (aside from being +/// consistent with how Rust the language works), see [this discussion] and +/// [Dijkstra's note on a related topic][note]. +/// +/// [this discussion]: https://github.com/rust-lang/regex/discussions/866 +/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html +/// +/// # Example /// -/// The lifetime parameter `'h` refers to the lifetime of the haystack. +/// This example shows the value of each of the methods on `Match` for a +/// particular search. +/// +/// ``` +/// use regex_lite::Regex; +/// +/// let re = Regex::new(r"\d+").unwrap(); +/// let hay = "numbers: 1234"; +/// let m = re.find(hay).unwrap(); +/// assert_eq!(9, m.start()); +/// assert_eq!(13, m.end()); +/// assert!(!m.is_empty()); +/// assert_eq!(4, m.len()); +/// assert_eq!(9..13, m.range()); +/// assert_eq!("1234", m.as_str()); +/// ``` #[derive(Copy, Clone, Eq, PartialEq)] pub struct Match<'h> { haystack: &'h str, @@ -406,23 +1483,48 @@ pub struct Match<'h> { impl<'h> Match<'h> { /// Creates a new match from the given haystack and byte offsets. + #[inline] fn new(haystack: &'h str, start: usize, end: usize) -> Match<'h> { Match { haystack, start, end } } - /// Returns the starting byte offset of the match in the haystack. + /// Returns the byte offset of the start of the match in the haystack. The + /// start of the match corresponds to the position where the match begins + /// and includes the first byte in the match. + /// + /// It is guaranteed that `Match::start() <= Match::end()`. + /// + /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That + /// is, it will never be an offset that appears between the UTF-8 code + /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is + /// always safe to slice the corresponding haystack using this offset. 
#[inline] pub fn start(&self) -> usize { self.start } - /// Returns the ending byte offset of the match in the haystack. + /// Returns the byte offset of the end of the match in the haystack. The + /// end of the match corresponds to the byte immediately following the last + /// byte in the match. This means that `&slice[start..end]` works as one + /// would expect. + /// + /// It is guaranteed that `Match::start() <= Match::end()`. + /// + /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That + /// is, it will never be an offset that appears between the UTF-8 code + /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is + /// always safe to slice the corresponding haystack using this offset. #[inline] pub fn end(&self) -> usize { self.end } /// Returns true if and only if this match has a length of zero. + /// + /// Note that an empty match can only occur when the regex itself can + /// match the empty string. Here are some examples of regexes that can + /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`, + /// `(foo|\d+|quux)?`. #[inline] pub fn is_empty(&self) -> bool { self.start == self.end @@ -436,24 +1538,28 @@ impl<'h> Match<'h> { /// Returns the range over the starting and ending byte offsets of the /// match in the haystack. + /// + /// It is always correct to slice the original haystack searched with this + /// range. That is, because the offsets are guaranteed to fall on valid + /// UTF-8 boundaries, the range returned is always valid. #[inline] pub fn range(&self) -> core::ops::Range<usize> { self.start..self.end } - /// Returns the matched portion of the haystack. + /// Returns the substring of the haystack that matched. 
#[inline] pub fn as_str(&self) -> &'h str { &self.haystack[self.range()] } } -impl<'h> std::fmt::Debug for Match<'h> { +impl<'h> core::fmt::Debug for Match<'h> { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { f.debug_struct("Match") .field("start", &self.start) .field("end", &self.end) - .field("haystack", &self.as_str()) + .field("string", &self.as_str()) .finish() } } @@ -470,17 +1576,52 @@ impl<'h> From<Match<'h>> for core::ops::Range<usize> { } } -/// Captures represents a group of captured strings for a single match. +/// Represents the capture groups for a single match. +/// +/// Capture groups refer to parts of a regex enclosed in parentheses. They can +/// be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. For +/// example, say you want to match the individual letters in a 5-letter word: +/// +/// ```text +/// (?<first>\w)(\w)(?:\w)\w(?<last>\w) +/// ``` +/// +/// This regex has 4 capture groups: /// -/// The 0th capture always corresponds to the entire match. Each subsequent -/// index corresponds to the next capture group in the regex. If a capture -/// group is named, then the matched string is *also* available via the `name` -/// method. (Note that the 0th capture is always unnamed and so must be -/// accessed with the `get` method.) +/// * The group at index `0` corresponds to the overall match. It is always +/// present in every match and never has a name. +/// * The group at index `1` with name `first` corresponding to the first +/// letter. +/// * The group at index `2` with no name corresponding to the second letter. +/// * The group at index `3` with name `last` corresponding to the fifth and +/// last letter. /// -/// Positions returned from a capture group are always byte indices. +/// Notice that `(?:\w)` was not listed above as a capture group despite it +/// being enclosed in parentheses. 
That's because `(?:pattern)` is a special +/// syntax that permits grouping but *without* capturing. The reason for not +/// treating it as a capture is that tracking and reporting capture groups +/// requires additional state that may lead to slower searches. So using as few +/// capture groups as possible can help performance. (Although the difference +/// in performance of a couple of capture groups is likely immaterial.) /// -/// `'h` is the lifetime of the matched haystack. +/// Values with this type are created by [`Regex::captures`] or +/// [`Regex::captures_iter`]. +/// +/// `'h` is the lifetime of the haystack that these captures were matched from. +/// +/// # Example +/// +/// ``` +/// use regex_lite::Regex; +/// +/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap(); +/// let caps = re.captures("toady").unwrap(); +/// assert_eq!("toady", &caps[0]); +/// assert_eq!("t", &caps["first"]); +/// assert_eq!("o", &caps[2]); +/// assert_eq!("y", &caps["last"]); +/// ``` pub struct Captures<'h> { haystack: &'h str, slots: CaptureLocations, @@ -491,73 +1632,215 @@ pub struct Captures<'h> { } impl<'h> Captures<'h> { - /// Returns the match associated with the capture group at index `i`. If - /// `i` does not correspond to a capture group, or if the capture group - /// did not participate in the match, then `None` is returned. + /// Returns the `Match` associated with the capture group at index `i`. If + /// `i` does not correspond to a capture group, or if the capture group did + /// not participate in the match, then `None` is returned. + /// + /// When `i == 0`, this is guaranteed to return a non-`None` value. 
/// /// # Examples /// - /// Get the haystack of the match with a default of an empty string if this + /// Get the substring that matched with a default of an empty string if the /// group didn't participate in the match: /// - /// ```rust + /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); /// let caps = re.captures("abc123").unwrap(); /// - /// let hay1 = caps.get(1).map_or("", |m| m.as_str()); - /// let hay2 = caps.get(2).map_or("", |m| m.as_str()); - /// assert_eq!(hay1, "123"); - /// assert_eq!(hay2, ""); + /// let substr1 = caps.get(1).map_or("", |m| m.as_str()); + /// let substr2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(substr1, "123"); + /// assert_eq!(substr2, ""); /// ``` #[inline] pub fn get(&self, i: usize) -> Option<Match<'h>> { self.slots.get(i).map(|(s, e)| Match::new(self.haystack, s, e)) } - /// Returns the match for the capture group named `name`. If `name` isn't a - /// valid capture group or didn't match anything, then `None` is returned. + /// Returns the `Match` associated with the capture group named `name`. If + /// `name` isn't a valid capture group or it refers to a group that didn't + /// match, then `None` is returned. + /// + /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime + /// matches the lifetime of the haystack in this `Captures` value. + /// Conversely, the substring returned by `caps["name"]` has a lifetime + /// of the `Captures` value, which is likely shorter than the lifetime of + /// the haystack. In some cases, it may be necessary to use this method to + /// access the matching substring instead of the `caps["name"]` notation. 
+ /// + /// # Examples + /// + /// Get the substring that matched with a default of an empty string if the + /// group didn't participate in the match: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new( + /// r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))", + /// ).unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let numbers = caps.name("numbers").map_or("", |m| m.as_str()); + /// let letters = caps.name("letters").map_or("", |m| m.as_str()); + /// assert_eq!(numbers, "123"); + /// assert_eq!(letters, ""); + /// ``` #[inline] pub fn name(&self, name: &str) -> Option<Match<'h>> { let i = self.pikevm.nfa().to_index(name)?; self.get(i) } - /// An iterator that yields all capturing matches in the order in which - /// they appear in the regex. If a particular capture group didn't - /// participate in the match, then `None` is yielded for that capture. + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups. /// - /// The first match always corresponds to the overall match of the regex. - #[inline] - pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> { - SubCaptureMatches { - caps: self, - it: self.pikevm.nfa().capture_names().enumerate(), - } - } - - /// Expands all instances of `$name` in `replacement` to the corresponding - /// capture group `name`, and writes them to the `dst` buffer given. + /// This returns a tuple where the first element corresponds to the full + /// substring of the haystack that matched the regex. The second element is + /// an array of substrings, with each corresponding to the to the substring + /// that matched for a particular capture group. /// - /// `name` may be an integer corresponding to the index of the capture - /// group (counted by order of opening parenthesis where `0` is the - /// entire match) or it can be a name (consisting of letters, digits or - /// underscores) corresponding to a named capture group. 
+ /// # Panics /// - /// If `name` isn't a valid capture group (whether the name doesn't exist - /// or isn't a valid index), then it is replaced with the empty string. + /// This panics if the number of possible matching groups in this + /// `Captures` value is not fixed to `N` in all circumstances. + /// More precisely, this routine only works when `N` is equivalent to + /// [`Regex::static_captures_len`]. /// - /// The longest possible name consisting of the characters `[_0-9A-Za-z]` - /// is used. e.g., `$1a` looks up the capture group named `1a` and not the - /// capture group at index `1`. To exert more precise control over the - /// name, or to refer to a capture group name that uses characters outside - /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When - /// using braces, any sequence of characters is permitted. If the sequence - /// does not refer to a capture group name in the corresponding regex, then - /// it is replaced with an empty string. + /// Stated more plainly, if the number of matching capture groups in a + /// regex can vary from match to match, then this function always panics. /// - /// To write a literal `$` use `$$`. + /// For example, `(a)(b)|(c)` could produce two matching capture groups + /// or one matching capture group for any given match. Therefore, one + /// cannot use `extract` with such a pattern. + /// + /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because + /// the number of capture groups in every match is always equivalent, + /// even if the capture _indices_ in each match are not. 
+ /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// let Some((full, [year, month, day])) = + /// re.captures(hay).map(|caps| caps.extract()) else { return }; + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// assert_eq!("03", month); + /// assert_eq!("14", day); + /// ``` + /// + /// # Example: iteration + /// + /// This example shows how to use this method when iterating over all + /// `Captures` matches in a haystack. + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); + /// let hay = "1973-01-05, 1975-08-25 and 1980-10-18"; + /// + /// let mut dates: Vec<(&str, &str, &str)> = vec![]; + /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) { + /// dates.push((y, m, d)); + /// } + /// assert_eq!(dates, vec![ + /// ("1973", "01", "05"), + /// ("1975", "08", "25"), + /// ("1980", "10", "18"), + /// ]); + /// ``` + /// + /// # Example: parsing different formats + /// + /// This API is particularly useful when you need to extract a particular + /// value that might occur in a different format. 
Consider, for example, + /// an identifier that might be in double quotes or single quotes: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap(); + /// let hay = r#"The first is id:"foo" and the second is id:'bar'."#; + /// let mut ids = vec![]; + /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) { + /// ids.push(id); + /// } + /// assert_eq!(ids, vec!["foo", "bar"]); + /// ``` + pub fn extract<const N: usize>(&self) -> (&'h str, [&'h str; N]) { + let len = self + .pikevm + .nfa() + .static_explicit_captures_len() + .expect("number of capture groups can vary in a match"); + assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len); + let mut matched = self.iter().flatten(); + let whole_match = matched.next().expect("a match").as_str(); + let group_matches = [0; N].map(|_| { + matched.next().expect("too few matching groups").as_str() + }); + (whole_match, group_matches) + } + + /// Expands all instances of `$ref` in `replacement` to the corresponding + /// capture group, and writes them to the `dst` buffer given. A `ref` can + /// be a capture group index or a name. If `ref` doesn't refer to a capture + /// group that participated in the match, then it is replaced with the + /// empty string. + /// + /// # Format + /// + /// The format of the replacement string supports two different kinds of + /// capture references: unbraced and braced. + /// + /// For the unbraced format, the format supported is `$ref` where `name` + /// can be any character in the class `[0-9A-Za-z_]`. `ref` is always + /// the longest possible parse. So for example, `$1a` corresponds to the + /// capture group named `1a` and not the capture group at index `1`. If + /// `ref` matches `^[0-9]+$`, then it is treated as a capture group index + /// itself and not a name. + /// + /// For the braced format, the format supported is `${ref}` where `ref` can + /// be any sequence of bytes except for `}`. 
If no closing brace occurs, + /// then it is not considered a capture reference. As with the unbraced + /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture + /// group index and not a name. + /// + /// The braced format is useful for exerting precise control over the name + /// of the capture reference. For example, `${1}a` corresponds to the + /// capture group reference `1` followed by the letter `a`, where as `$1a` + /// (as mentioned above) corresponds to the capture group reference `1a`. + /// The braced format is also useful for expressing capture group names + /// that use characters not supported by the unbraced format. For example, + /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`. + /// + /// If a capture group reference is found and it does not refer to a valid + /// capture group, then it will be replaced with the empty string. + /// + /// To write a literal `$`, use `$$`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new( + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// ).unwrap(); + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// let caps = re.captures(hay).unwrap(); + /// + /// let mut dst = String::new(); + /// caps.expand("year=$year, month=$month, day=$day", &mut dst); + /// assert_eq!(dst, "year=2010, month=03, day=14"); + /// ``` #[inline] pub fn expand(&self, replacement: &str, dst: &mut String) { interpolate::string( @@ -574,10 +1857,56 @@ impl<'h> Captures<'h> { ); } - /// Returns the total number of capture groups (even if they didn't match). + /// Returns an iterator over all capture groups. This includes both + /// matching and non-matching groups. + /// + /// The iterator always yields at least one matching group: the first group + /// (at index `0`) with no name. Subsequent groups are returned in the order + /// of their opening parenthesis in the regex. 
+ /// + /// The elements yielded have type `Option<Match<'h>>`, where a non-`None` + /// value is present if the capture group matches. + /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap(); + /// let caps = re.captures("AZ").unwrap(); + /// + /// let mut it = caps.iter(); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("AZ")); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("A")); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), None); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("Z")); + /// assert_eq!(it.next(), None); + /// ``` + #[inline] + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> { + SubCaptureMatches { + caps: self, + it: self.pikevm.nfa().capture_names().enumerate(), + } + } + + /// Returns the total number of capture groups. This includes both + /// matching and non-matching groups. + /// + /// The length returned is always equivalent to the number of elements + /// yielded by [`Captures::iter`]. Consequently, the length is always + /// greater than zero since every `Captures` value always includes the + /// match for the entire regex. + /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; /// - /// This is always at least `1`, since every regex has at least one capture - /// group that corresponds to the full match. + /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap(); + /// let caps = re.captures("AZ").unwrap(); + /// assert_eq!(caps.len(), 4); + /// ``` #[inline] pub fn len(&self) -> usize { self.pikevm.nfa().group_len() @@ -586,25 +1915,83 @@ impl<'h> Captures<'h> { impl<'h> core::fmt::Debug for Captures<'h> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - // TODO: Make this better. - f.debug_tuple("Captures").field(&self.slots).finish() + /// A little helper type to provide a nice map-like debug + /// representation for our capturing group spans. 
+ /// + /// regex-automata has something similar, but it includes the pattern + /// ID in its debug output, which is confusing. It also doesn't include + /// that strings that match because a regex-automata `Captures` doesn't + /// borrow the haystack. + struct CapturesDebugMap<'a> { + caps: &'a Captures<'a>, + } + + impl<'a> core::fmt::Debug for CapturesDebugMap<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut map = f.debug_map(); + let names = self.caps.pikevm.nfa().capture_names(); + for (group_index, maybe_name) in names.enumerate() { + let key = Key(group_index, maybe_name); + match self.caps.get(group_index) { + None => map.entry(&key, &None::<()>), + Some(mat) => map.entry(&key, &Value(mat)), + }; + } + map.finish() + } + } + + struct Key<'a>(usize, Option<&'a str>); + + impl<'a> core::fmt::Debug for Key<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.0)?; + if let Some(name) = self.1 { + write!(f, "/{:?}", name)?; + } + Ok(()) + } + } + + struct Value<'a>(Match<'a>); + + impl<'a> core::fmt::Debug for Value<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!( + f, + "{}..{}/{:?}", + self.0.start(), + self.0.end(), + self.0.as_str() + ) + } + } + + f.debug_tuple("Captures") + .field(&CapturesDebugMap { caps: self }) + .finish() } } -/// Get a group by index. +/// Get a matching capture group's haystack substring by index. /// -/// `'h` is the lifetime of the matched portion of the haystack. +/// The haystack substring returned can't outlive the `Captures` object if this +/// method is used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it). To work around this limitation, do that, use +/// [`Captures::get`] instead. 
/// -/// The haystack can't outlive the `Captures` object if this method is used, -/// because of how `Index` is defined (normally `a[i]` is part of `a` and can't -/// outlive it); to do that, use `get()` instead. +/// `'h` is the lifetime of the matched haystack, but the lifetime of the +/// `&str` returned by this implementation is the lifetime of the `Captures` +/// value itself. /// /// # Panics /// -/// If there is no group at the given index. +/// If there is no matching group at the given index. impl<'h> core::ops::Index<usize> for Captures<'h> { type Output = str; + // The lifetime is written out to make it clear that the &str returned + // does NOT have a lifetime equivalent to 'h. fn index(&self, i: usize) -> &str { self.get(i) .map(|m| m.as_str()) @@ -612,18 +1999,22 @@ impl<'h> core::ops::Index<usize> for Captures<'h> { } } -/// Get a group by name. +/// Get a matching capture group's haystack substring by name. /// -/// `'h` is the lifetime of the matched portion of the haystack and `'n` is the -/// lifetime of the group name that is used as the lookup key. +/// The haystack substring returned can't outlive the `Captures` object if this +/// method is used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it). To work around this limitation, do that, use +/// [`Captures::get`] instead. /// -/// The haystack can't outlive the `Captures` object if this method is used, -/// because of how `Index` is defined (normally `a[i]` is part of `a` and can't -/// outlive it); to do that, use `name` instead. +/// `'h` is the lifetime of the matched haystack, but the lifetime of the +/// `&str` returned by this implementation is the lifetime of the `Captures` +/// value itself. +/// +/// `'n` is the lifetime of the group name used to index the `Captures` value. /// /// # Panics /// -/// If there is no group named by the given value. +/// If there is no matching group at the given name. 
impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { type Output = str; @@ -634,59 +2025,20 @@ impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { } } -/// An iterator that yields all capturing matches in the order in which they -/// appear in the regex. -/// -/// If a particular capture group didn't participate in the match, then `None` -/// is yielded for that capture. The first match always corresponds to the -/// overall match of the regex. -/// -/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and -/// the lifetime `'h` corresponds to the originally matched haystack. -#[derive(Clone, Debug)] -pub struct SubCaptureMatches<'c, 'h> { - caps: &'c Captures<'h>, - it: core::iter::Enumerate<nfa::CaptureNames<'c>>, -} - -impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> { - type Item = Option<Match<'h>>; - - #[inline] - fn next(&mut self) -> Option<Option<Match<'h>>> { - let (group_index, _) = self.it.next()?; - Some(self.caps.get(group_index)) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.it.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {} - -impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} - -/// CaptureLocations is a low level representation of the raw offsets of each -/// submatch. +/// A low level representation of the byte offsets of each capture group. /// /// You can think of this as a lower level [`Captures`], where this type does /// not support named capturing groups directly and it does not borrow the /// haystack that these offsets were matched on. /// -/// Primarily, this type is useful when using the lower level `Regex` APIs -/// such as `read_captures`, which permits amortizing the allocation in which -/// capture match locations are stored. 
+/// Primarily, this type is useful when using the lower level `Regex` APIs such +/// as [`Regex::captures_read`], which permits amortizing the allocation in +/// which capture match offsets are stored. /// /// In order to build a value of this type, you'll need to call the -/// `capture_locations` method on the `Regex` being used to execute the search. -/// The value returned can then be reused in subsequent searches. +/// [`Regex::capture_locations`] method. The value returned can then be reused +/// in subsequent searches for that regex. Using it for other regexes may +/// result in a panic or otherwise incorrect results. /// /// # Example /// @@ -711,19 +2063,23 @@ impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} #[derive(Clone, Debug)] pub struct CaptureLocations(Vec<Option<NonMaxUsize>>); -/// A type alias for `CaptureLocations` for backwards compatibility. -/// -/// Previously, we exported `CaptureLocations` as `Locations` in an -/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), -/// we continue re-exporting the same undocumented API. -#[doc(hidden)] -pub type Locations = CaptureLocations; - impl CaptureLocations { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original string matched. + /// Returns the start and end byte offsets of the capture group at index + /// `i`. This returns `None` if `i` is not a valid capture group or if the + /// capture group did not match. 
+ /// + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); + /// assert_eq!(Some((0, 17)), locs.get(0)); + /// assert_eq!(Some((0, 5)), locs.get(1)); + /// assert_eq!(Some((6, 17)), locs.get(2)); + /// ``` #[inline] pub fn get(&self, i: usize) -> Option<(usize, usize)> { let slot = i.checked_mul(2)?; @@ -734,66 +2090,59 @@ impl CaptureLocations { } /// Returns the total number of capture groups (even if they didn't match). + /// That is, the length returned is unaffected by the result of a search. /// /// This is always at least `1` since every regex has at least `1` /// capturing group that corresponds to the entire match. - #[inline] - pub fn len(&self) -> usize { - // We always have twice as many slots as groups. - self.0.len().checked_shr(1).unwrap() - } - - /// An alias for the `get` method for backwards compatibility. /// - /// Previously, we exported `get` as `pos` in an undocumented API. To - /// prevent breaking that code (e.g., in `regex-capi`), we continue - /// re-exporting the same undocumented API. - #[doc(hidden)] - #[inline] - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - self.get(i) - } -} - -/// An iterator over the names of all possible captures. -/// -/// `None` indicates an unnamed capture; the first element (capture 0, the -/// whole matched region) is always unnamed. -/// -/// `'r` is the lifetime of the compiled regular expression. 
-#[derive(Clone, Debug)] -pub struct CaptureNames<'r>(nfa::CaptureNames<'r>); - -impl<'r> Iterator for CaptureNames<'r> { - type Item = Option<&'r str>; - - #[inline] - fn next(&mut self) -> Option<Option<&'r str>> { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.0.size_hint() - } - + /// # Example + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert_eq!(3, locs.len()); + /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); + /// assert_eq!(3, locs.len()); + /// ``` + /// + /// Notice that the length is always at least `1`, regardless of the regex: + /// + /// ``` + /// use regex_lite::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let locs = re.capture_locations(); + /// assert_eq!(1, locs.len()); + /// + /// // [a&&b] is a regex that never matches anything. + /// let re = Regex::new(r"[^\s\S]").unwrap(); + /// let locs = re.capture_locations(); + /// assert_eq!(1, locs.len()); + /// ``` #[inline] - fn count(self) -> usize { - self.0.count() + pub fn len(&self) -> usize { + // We always have twice as many slots as groups. + self.0.len().checked_shr(1).unwrap() } } -impl<'r> ExactSizeIterator for CaptureNames<'r> {} - -impl<'r> core::iter::FusedIterator for CaptureNames<'r> {} - -/// An iterator over all non-overlapping matches for a particular string. +/// An iterator over all non-overlapping matches in a haystack. /// -/// The iterator yields a `Match` value. The iterator stops when no more +/// This iterator yields [`Match`] values. The iterator stops when no more /// matches can be found. /// /// `'r` is the lifetime of the compiled regular expression and `'h` is the -/// lifetime of the matched string. +/// lifetime of the haystack. +/// +/// This iterator is created by [`Regex::find_iter`]. 
+/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. #[derive(Debug)] pub struct Matches<'r, 'h> { haystack: &'h str, @@ -816,13 +2165,21 @@ impl<'r, 'h> Iterator for Matches<'r, 'h> { impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {} -/// An iterator that yields all non-overlapping capture groups matching a -/// particular regular expression. +/// An iterator over all non-overlapping capture matches in a haystack. /// -/// The iterator stops when no more matches can be found. +/// This iterator yields [`Captures`] values. The iterator stops when no more +/// matches can be found. /// /// `'r` is the lifetime of the compiled regular expression and `'h` is the /// lifetime of the matched string. +/// +/// This iterator is created by [`Regex::captures_iter`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. #[derive(Debug)] pub struct CaptureMatches<'r, 'h> { haystack: &'h str, @@ -850,10 +2207,18 @@ impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> { impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {} -/// Yields all substrings delimited by a regular expression match. +/// An iterator over all substrings delimited by a regex match. /// /// `'r` is the lifetime of the compiled regular expression and `'h` is the -/// lifetime of the string being split. +/// lifetime of the byte string being split. +/// +/// This iterator is created by [`Regex::split`]. 
+/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. #[derive(Debug)] pub struct Split<'r, 'h> { haystack: &'h str, @@ -878,7 +2243,7 @@ impl<'r, 'h> Iterator for Split<'r, 'h> { } } Some(m) => { - let range = m.range(); + let range = self.last..m.start(); self.last = m.end(); Some(&self.haystack[range]) } @@ -888,12 +2253,24 @@ impl<'r, 'h> Iterator for Split<'r, 'h> { impl<'r, 't> core::iter::FusedIterator for Split<'r, 't> {} -/// Yields at most `N` substrings delimited by a regular expression match. +/// An iterator over at most `N` substrings delimited by a regex match. /// -/// The last substring will be whatever remains after splitting. +/// The last substring yielded by this iterator will be whatever remains after +/// `N-1` splits. /// /// `'r` is the lifetime of the compiled regular expression and `'h` is the -/// lifetime of the string being split. +/// lifetime of the byte string being split. +/// +/// This iterator is created by [`Regex::splitn`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. +/// +/// Although note that the worst case time here has an upper bound given +/// by the `limit` parameter to [`Regex::splitn`]. #[derive(Debug)] pub struct SplitN<'r, 'h> { splits: Split<'r, 'h>, @@ -932,34 +2309,143 @@ impl<'r, 'h> Iterator for SplitN<'r, 'h> { impl<'r, 't> core::iter::FusedIterator for SplitN<'r, 't> {} -/// Replacer describes types that can be used to replace matches in a string. +/// An iterator over the names of all capture groups in a regex. 
+/// +/// This iterator yields values of type `Option<&str>` in order of the opening +/// capture group parenthesis in the regex pattern. `None` is yielded for +/// groups with no name. The first element always corresponds to the implicit +/// and unnamed group for the overall match. +/// +/// `'r` is the lifetime of the compiled regular expression. +/// +/// This iterator is created by [`Regex::capture_names`]. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(nfa::CaptureNames<'r>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + #[inline] + fn next(&mut self) -> Option<Option<&'r str>> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> core::iter::FusedIterator for CaptureNames<'r> {} + +/// An iterator over all group matches in a [`Captures`] value. +/// +/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the +/// lifetime of the haystack that the matches are for. The order of elements +/// yielded corresponds to the order of the opening parenthesis for the group +/// in the regex pattern. `None` is yielded for groups that did not participate +/// in the match. +/// +/// The first element always corresponds to the implicit group for the overall +/// match. Since this iterator is created by a [`Captures`] value, and a +/// `Captures` value is only created when a match occurs, it follows that the +/// first element yielded by this iterator is guaranteed to be non-`None`. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that +/// created this iterator, and the lifetime `'h` corresponds to the originally +/// matched haystack. 
+#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 'h> { + caps: &'c Captures<'h>, + it: core::iter::Enumerate<nfa::CaptureNames<'c>>, +} + +impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> { + type Item = Option<Match<'h>>; + + #[inline] + fn next(&mut self) -> Option<Option<Match<'h>>> { + let (group_index, _) = self.it.next()?; + Some(self.caps.get(group_index)) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {} + +impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} + +/// A trait for types that can be used to replace matches in a haystack. /// /// In general, users of this crate shouldn't need to implement this trait, /// since implementations are already provided for `&str` along with other -/// variants of string types and `FnMut(&Captures) -> String` (or any -/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases. +/// variants of string types, as well as `FnMut(&Captures) -> String` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<str>`). Those cover most use cases, +/// but callers can implement this trait directly if necessary. +/// +/// # Example +/// +/// This example shows a basic implementation of the `Replacer` trait. This +/// can be done much more simply using the replacement string interpolation +/// support (e.g., `$first $last`), but this approach avoids needing to parse +/// the replacement string at all. 
+/// +/// ``` +/// use regex_lite::{Captures, Regex, Replacer}; +/// +/// struct NameSwapper; +/// +/// impl Replacer for NameSwapper { +/// fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { +/// dst.push_str(&caps["first"]); +/// dst.push_str(" "); +/// dst.push_str(&caps["last"]); +/// } +/// } +/// +/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); +/// let result = re.replace("Springsteen, Bruce", NameSwapper); +/// assert_eq!(result, "Bruce Springsteen"); +/// ``` pub trait Replacer { - /// Appends text to `dst` to replace the current match. + /// Appends possibly empty data to `dst` to replace the current match. /// /// The current match is represented by `caps`, which is guaranteed to /// have a match at capture group `0`. /// - /// For example, a no-op replacement would be - /// `dst.push_str(caps.get(0).unwrap().as_str())`. + /// For example, a no-op replacement would be `dst.push_str(&caps[0])`. fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); /// Return a fixed unchanging replacement string. /// - /// When doing replacements, if access to `Captures` is not needed (e.g., - /// the replacement byte string does not need `$` expansion), then it can - /// be beneficial to avoid finding sub-captures. + /// When doing replacements, if access to [`Captures`] is not needed (e.g., + /// the replacement string does not need `$` expansion), then it can be + /// beneficial to avoid finding sub-captures. /// - /// In general, this is called once for every call to `replacen`. + /// In general, this is called once for every call to a replacement routine + /// such as [`Regex::replace_all`]. fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { None } - /// Return a `Replacer` that borrows and wraps this `Replacer`. + /// Returns a type that implements `Replacer`, but that borrows and wraps + /// this `Replacer`. 
/// /// This is useful when you want to take a generic `Replacer` (which might /// not be cloneable) and use it without consuming it, so it can be used @@ -985,21 +2471,6 @@ pub trait Replacer { } } -/// By-reference adaptor for a `Replacer` -/// -/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). -#[derive(Debug)] -pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); - -impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - self.0.replace_append(caps, dst) - } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - self.0.no_expansion() - } -} - impl<'a> Replacer for &'a str { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { caps.expand(*self, dst); @@ -1050,14 +2521,6 @@ impl<'a> Replacer for &'a Cow<'a, str> { } } -fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { - let s = t.as_ref(); - match s.find('$') { - Some(_) => None, - None => Some(Cow::Borrowed(s)), - } -} - impl<F, T> Replacer for F where F: FnMut(&Captures<'_>) -> T, @@ -1068,14 +2531,44 @@ where } } -/// `NoExpand` indicates literal string replacement. +/// A by-reference adaptor for a [`Replacer`]. +/// +/// This permits reusing the same `Replacer` value in multiple calls to a +/// replacement routine like [`Regex::replace_all`]. +/// +/// This type is created by [`Replacer::by_ref`]. +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.0.replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + self.0.no_expansion() + } +} + +/// A helper type for forcing literal string replacement. 
+/// +/// It can be used with routines like [`Regex::replace`] and +/// [`Regex::replace_all`] to do a literal string replacement without expanding +/// `$name` to their corresponding capture groups. This can be both convenient +/// (to avoid escaping `$`, for example) and faster (since capture groups +/// don't need to be found). +/// +/// `'s` is the lifetime of the literal string to use. +/// +/// # Example /// -/// It can be used with `replace` and `replace_all` to do a literal string -/// replacement without expanding `$name` to their corresponding capture -/// groups. This can be both convenient (to avoid escaping `$`, for example) -/// and performant (since capture groups don't need to be found). +/// ``` +/// use regex_lite::{NoExpand, Regex}; /// -/// `'t` is the lifetime of the literal text. +/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); +/// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); +/// assert_eq!(result, "$2 $last"); +/// ``` #[derive(Clone, Debug)] pub struct NoExpand<'t>(pub &'t str); @@ -1089,6 +2582,27 @@ impl<'t> Replacer for NoExpand<'t> { } } +/// Quickly checks the given replacement string for whether interpolation +/// should be done on it. It returns `None` if a `$` was found anywhere in the +/// given string, which suggests interpolation needs to be done. But if there's +/// no `$` anywhere, then interpolation definitely does not need to be done. In +/// that case, the given string is returned as a borrowed `Cow`. +/// +/// This is meant to be used to implement the `Replacer::no_expandsion` method +/// in its various trait impls. +fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { + let s = t.as_ref(); + match s.find('$') { + Some(_) => None, + None => Some(Cow::Borrowed(s)), + } +} + +/// A configurable builder for a [`Regex`]. +/// +/// This builder can be used to programmatically set flags such as `i` (case +/// insensitive) and `x` (for verbose mode). 
This builder can also be used to +/// configure things like a size limit on the compiled regular expression. #[derive(Debug)] pub struct RegexBuilder { pattern: String, @@ -1097,6 +2611,11 @@ pub struct RegexBuilder { } impl RegexBuilder { + /// Create a new builder with a default configuration for the given + /// pattern. + /// + /// If the pattern is invalid or exceeds the configured size limits, then + /// an error will be returned when [`RegexBuilder::build`] is called. pub fn new(pattern: &str) -> RegexBuilder { RegexBuilder { pattern: pattern.to_string(), @@ -1105,6 +2624,11 @@ impl RegexBuilder { } } + /// Compiles the pattern given to `RegexBuilder::new` with the + /// configuration set on this builder. + /// + /// If the pattern isn't a valid regex or if a configured size limit was + /// exceeded, then an error is returned. pub fn build(&self) -> Result<Regex, Error> { let hir = Hir::parse(self.hir_config, &self.pattern)?; let nfa = NFA::new(self.nfa_config, self.pattern.clone(), &hir)?; @@ -1117,66 +2641,315 @@ impl RegexBuilder { Ok(Regex { pikevm, pool }) } + /// This configures whether to enable ASCII case insensitive matching for + /// the entire pattern. + /// + /// This setting can also be configured using the inline flag `i` + /// in the pattern. For example, `(?i:foo)` matches `foo` case + /// insensitively while `(?-i:foo)` matches `foo` case sensitively. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") + /// .case_insensitive(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match("FoObarQuUx")); + /// // Even though case insensitive matching is enabled in the builder, + /// // it can be locally disabled within the pattern. In this case, + /// // `bar` is matched case sensitively. 
+ /// assert!(!re.is_match("fooBARquux")); + /// ``` pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.case_insensitive = yes; self } + /// This configures multi-line mode for the entire pattern. + /// + /// Enabling multi-line mode changes the behavior of the `^` and `$` anchor + /// assertions. Instead of only matching at the beginning and end of a + /// haystack, respectively, multi-line mode causes them to match at the + /// beginning and end of a line *in addition* to the beginning and end of + /// a haystack. More precisely, `^` will match at the position immediately + /// following a `\n` and `$` will match at the position immediately + /// preceding a `\n`. + /// + /// The behavior of this option is impacted by the [`RegexBuilder::crlf`] + /// setting. Namely, CRLF mode changes the line terminator to be either + /// `\r` or `\n`, but never at the position between a `\r` and `\`n. + /// + /// This setting can also be configured using the inline flag `m` in the + /// pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .build() + /// .unwrap(); + /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range())); + /// ``` pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.multi_line = yes; self } - pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { - self.hir_config.flags.crlf = yes; + /// This configures dot-matches-new-line mode for the entire pattern. + /// + /// Perhaps surprisingly, the default behavior for `.` is not to match + /// any character, but rather, to match any character except for the line + /// terminator (which is `\n` by default). When this mode is enabled, the + /// behavior changes such that `.` truly matches any character. 
+ /// + /// This setting can also be configured using the inline flag `s` in the + /// pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"foo.bar") + /// .dot_matches_new_line(true) + /// .build() + /// .unwrap(); + /// let hay = "foo\nbar"; + /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str())); + /// ``` + pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.dot_matches_new_line = yes; self } - pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { - self.hir_config.flags.dot_matches_new_line = yes; + /// This configures CRLF mode for the entire pattern. + /// + /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for + /// short) and `\n` ("line feed" or LF for short) are treated as line + /// terminators. This results in the following: + /// + /// * Unless dot-matches-new-line mode is enabled, `.` will now match any + /// character except for `\n` and `\r`. + /// * When multi-line mode is enabled, `^` will match immediatelly + /// following a `\n` or a `\r`. Similarly, `$` will match immediately + /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match between + /// `\r` and `\n`. + /// + /// This setting can also be configured using the inline flag `R` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = "\r\nfoo\r\n"; + /// // If CRLF mode weren't enabled here, then '$' wouldn't match + /// // immediately after 'foo', and thus no match would be found. 
+ /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str())); + /// ``` + /// + /// This example demonstrates that `^` will never match at a position + /// between `\r` and `\n`. (`$` will similarly not match between a `\r` + /// and a `\n`.) + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^") + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = "\r\n\r\n"; + /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); + /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); + /// ``` + pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { + self.hir_config.flags.crlf = yes; self } + /// This configures swap-greed mode for the entire pattern. + /// + /// When swap-greed mode is enabled, patterns like `a+` will become + /// non-greedy and patterns like `a+?` will become greedy. In other words, + /// the meanings of `a+` and `a+?` are switched. + /// + /// This setting can also be configured using the inline flag `U` in the + /// pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"a+") + /// .swap_greed(true) + /// .build() + /// .unwrap(); + /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str())); + /// ``` pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.swap_greed = yes; self } + /// This configures verbose mode for the entire pattern. + /// + /// When enabled, whitespace will treated as insignifcant in the pattern + /// and `#` can be used to start a comment until the next new line. + /// + /// Normally, in most places in a pattern, whitespace is treated literally. + /// For example ` +` will match one or more ASCII whitespace characters. + /// + /// When verbose mode is enabled, `\#` can be used to match a literal `#` + /// and `\ ` can be used to match a literal ASCII whitespace character. 
+ /// + /// Verbose mode is useful for permitting regexes to be formatted and + /// broken up more nicely. This may make them more easily readable. + /// + /// This setting can also be configured using the inline flag `x` in the + /// pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// let pat = r" + /// \b + /// (?<first>[A-Z]\w*) # always start with uppercase letter + /// \s+ # whitespace should separate names + /// (?: # middle name can be an initial! + /// (?:(?<initial>[A-Z])\.|(?<middle>[A-Z]\w*)) + /// \s+ + /// )? + /// (?<last>[A-Z]\w*) + /// \b + /// "; + /// let re = RegexBuilder::new(pat) + /// .ignore_whitespace(true) + /// .build() + /// .unwrap(); + /// + /// let caps = re.captures("Harry Potter").unwrap(); + /// assert_eq!("Harry", &caps["first"]); + /// assert_eq!("Potter", &caps["last"]); + /// + /// let caps = re.captures("Harry J. Potter").unwrap(); + /// assert_eq!("Harry", &caps["first"]); + /// // Since a middle name/initial isn't required for an overall match, + /// // we can't assume that 'initial' or 'middle' will be populated! + /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str())); + /// assert_eq!(None, caps.name("middle").map(|m| m.as_str())); + /// assert_eq!("Potter", &caps["last"]); + /// + /// let caps = re.captures("Harry James Potter").unwrap(); + /// assert_eq!("Harry", &caps["first"]); + /// // Since a middle name/initial isn't required for an overall match, + /// // we can't assume that 'initial' or 'middle' will be populated! + /// assert_eq!(None, caps.name("initial").map(|m| m.as_str())); + /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str())); + /// assert_eq!("Potter", &caps["last"]); + /// ``` pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.ignore_whitespace = yes; self } + /// Sets the approximate size limit, in bytes, of the compiled regex. 
+ /// This roughly corresponds to the amount of heap memory, in bytes, + /// occupied by a single regex.
Therefore, if + /// callers want to put a limit on the amount of heap space used, then they + /// should impose a limit on the length, in bytes, of the concrete pattern + /// string. In particular, this is viable since this parser implementation + /// will limit itself to heap space proportional to the length of the + /// pattern string. See also the [untrusted inputs](crate#untrusted-input) + /// section in the top-level crate documentation for more information about + /// this. + /// + /// Note that a nest limit of `0` will return a nest limit error for most + /// patterns but not all. For example, a nest limit of `0` permits `a` but + /// not `ab`, since `ab` requires an explicit concatenation, which results + /// in a nest depth of `1`. In general, a nest limit is not something that + /// manifests in an obvious way in the concrete syntax, therefore, it + /// should not be used in a granular way. + /// + /// # Example + /// + /// ``` + /// use regex_lite::RegexBuilder; + /// + /// assert!(RegexBuilder::new(r"").nest_limit(0).build().is_ok()); + /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); + /// assert!(RegexBuilder::new(r"(a)").nest_limit(0).build().is_err()); + /// ``` pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { self.hir_config.nest_limit = limit; self } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn scratch() { - let re = Regex::new("abc").unwrap(); - assert_eq!(Some(0..3), re.find("abc").map(|m| m.range())); - - let re = Regex::new("abc").unwrap(); - assert_eq!(Some(4..7), re.find("foo abc").map(|m| m.range())); - - let re = Regex::new("^abc").unwrap(); - assert_eq!(Some(0..3), re.find("abc").map(|m| m.range())); - - let re = Regex::new("^abc").unwrap(); - assert_eq!(None, re.find("foo abc").map(|m| m.range())); - - let re = Regex::new("(?Rm)^foo$").unwrap(); - assert_eq!(Some(2..5), re.find("\r\nfoo\r\n").map(|m| m.range())); - } -} diff --git a/regex-lite/src/utf8.rs b/regex-lite/src/utf8.rs index 
85ea99857c..5f2a6a153c 100644 --- a/regex-lite/src/utf8.rs +++ b/regex-lite/src/utf8.rs @@ -114,13 +114,13 @@ fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; - let class = CLASSES[b as usize]; + let class = CLASSES[usize::from(b)]; if *state == ACCEPT { *cp = (0xFF >> class) & (b as u32); } else { *cp = (b as u32 & 0b111111) | (*cp << 6); } - *state = STATES_FORWARD[*state + class as usize] as usize; + *state = usize::from(STATES_FORWARD[*state + usize::from(class)]); } #[cfg(test)] diff --git a/regex-lite/tests/lib.rs b/regex-lite/tests/lib.rs index fd75458122..757b394411 100644 --- a/regex-lite/tests/lib.rs +++ b/regex-lite/tests/lib.rs @@ -2,8 +2,6 @@ mod fuzz; mod string; const BLACKLIST: &[&str] = &[ - // CRLF-aware line anchors aren't supported in regex API yet. - "crlf", // Custom line terminators aren't supported in regex-lite. We could add it, // but it didn't seem worth it. "line-terminator", diff --git a/regex-syntax/README.md b/regex-syntax/README.md index ff4fe094c3..529513b0c8 100644 --- a/regex-syntax/README.md +++ b/regex-syntax/README.md @@ -4,7 +4,6 @@ This crate provides a robust regular expression parser. [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) -[![Rust](https://img.shields.io/badge/rust-1.28.0%2B-blue.svg?maxAge=3600)](https://github.com/rust-lang/regex) ### Documentation diff --git a/regex-syntax/src/hir/literal.rs b/regex-syntax/src/hir/literal.rs index 6e847a04b3..bcab2fb752 100644 --- a/regex-syntax/src/hir/literal.rs +++ b/regex-syntax/src/hir/literal.rs @@ -51,7 +51,7 @@ the "trickier" parts are how to combine literal sequences, and that is all implemented on [`Seq`]. 
*/ -use core::{cmp, mem}; +use core::{cmp, mem, num::NonZeroUsize}; use alloc::{vec, vec::Vec}; @@ -1571,7 +1571,7 @@ impl Seq { /// unioning `self` with `other`. If either set is infinite, then this /// returns `None`. #[inline] - fn max_union_len(&self, other: &Seq) -> Option<usize> { + pub fn max_union_len(&self, other: &Seq) -> Option<usize> { let len1 = self.len()?; let len2 = other.len()?; Some(len1.saturating_add(len2)) @@ -1581,7 +1581,7 @@ impl Seq { /// cross product of `self` with `other`. If either set is infinite, then /// this returns `None`. #[inline] - fn max_cross_len(&self, other: &Seq) -> Option<usize> { + pub fn max_cross_len(&self, other: &Seq) -> Option<usize> { let len1 = self.len()?; let len2 = other.len()?; Some(len1.saturating_mul(len2)) @@ -1966,7 +1966,11 @@ impl Seq { } else { self.keep_last_bytes(keep); } - self.minimize_by_preference(); + if prefix { + if let Some(ref mut lits) = self.literals { + PreferenceTrie::minimize(lits, true); + } + } } // Check for a poison literal. A poison literal is one that is short // and is believed to have a very high match count. These poisons @@ -2016,7 +2020,7 @@ impl core::fmt::Debug for Seq { if let Some(lits) = self.literals() { f.debug_list().entries(lits.iter()).finish() } else { - write!(f, "[∅]") + write!(f, "[∞]") } } } @@ -2199,12 +2203,19 @@ impl core::fmt::Debug for Literal { /// never seen this show up on a profile. Because of the heuristic limits /// imposed on literal extractions, the size of the inputs here is usually /// very small.) -#[derive(Debug, Default)] +#[derive(Debug)] struct PreferenceTrie { /// The states in this trie. The index of a state in this vector is its ID. states: Vec<State>, + /// This vec indicates which states are match states. It always has + /// the same length as `states` and is indexed by the same state ID. + /// A state with identifier `sid` is a match state if and only if + /// `matches[sid].is_some()`. 
The option contains the index of the literal + /// corresponding to the match. The index is offset by 1 so that it fits in + /// a NonZeroUsize. + matches: Vec<Option<NonZeroUsize>>, /// The index to allocate to the next literal added to this trie. Starts at - /// 0 and increments by 1 for every literal successfully added to the trie. + /// 1 and increments by 1 for every literal successfully added to the trie. next_literal_index: usize, } @@ -2215,9 +2226,6 @@ struct State { /// are sorted by byte. There is at most one such transition for any /// particular byte. trans: Vec<(u8, usize)>, - /// Whether this is a matching state or not. If it is, then it contains the - /// index to the matching literal. - literal_index: Option<usize>, } impl PreferenceTrie { @@ -2234,14 +2242,18 @@ impl PreferenceTrie { use core::cell::RefCell; // MSRV(1.61): Use retain_mut here to avoid interior mutability. - let trie = RefCell::new(PreferenceTrie::default()); + let trie = RefCell::new(PreferenceTrie { + states: vec![], + matches: vec![], + next_literal_index: 1, + }); let mut make_inexact = vec![]; literals.retain(|lit| { match trie.borrow_mut().insert(lit.as_bytes()) { Ok(_) => true, Err(i) => { if !keep_exact { - make_inexact.push(i); + make_inexact.push(i.checked_sub(1).unwrap()); } false } @@ -2264,15 +2276,15 @@ impl PreferenceTrie { /// search. 
fn insert(&mut self, bytes: &[u8]) -> Result<usize, usize> { let mut prev = self.root(); - if let Some(idx) = self.states[prev].literal_index { - return Err(idx); + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); } for &b in bytes.iter() { match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) { Ok(i) => { prev = self.states[prev].trans[i].1; - if let Some(idx) = self.states[prev].literal_index { - return Err(idx); + if let Some(idx) = self.matches[prev] { + return Err(idx.get()); } } Err(i) => { @@ -2284,7 +2296,7 @@ impl PreferenceTrie { } let idx = self.next_literal_index; self.next_literal_index += 1; - self.states[prev].literal_index = Some(idx); + self.matches[prev] = NonZeroUsize::new(idx); Ok(idx) } @@ -2301,6 +2313,7 @@ impl PreferenceTrie { fn create_state(&mut self) -> usize { let id = self.states.len(); self.states.push(State::default()); + self.matches.push(None); id } } diff --git a/regex-syntax/src/hir/mod.rs b/regex-syntax/src/hir/mod.rs index 66b1863a13..c6272cd4c8 100644 --- a/regex-syntax/src/hir/mod.rs +++ b/regex-syntax/src/hir/mod.rs @@ -88,6 +88,9 @@ pub enum ErrorKind { /// This error occurs when translating a pattern that could match a byte /// sequence that isn't UTF-8 and `utf8` was enabled. InvalidUtf8, + /// This error occurs when one uses a non-ASCII byte for a line terminator, + /// but where Unicode mode is enabled and UTF-8 mode is disabled. + InvalidLineTerminator, /// This occurs when an unrecognized Unicode property name could not /// be found. 
UnicodePropertyNotFound, @@ -120,6 +123,7 @@ impl core::fmt::Display for ErrorKind { let msg = match *self { UnicodeNotAllowed => "Unicode not allowed here", InvalidUtf8 => "pattern can match invalid UTF-8", + InvalidLineTerminator => "invalid line terminator, must be ASCII", UnicodePropertyNotFound => "Unicode property not found", UnicodePropertyValueNotFound => "Unicode property value not found", UnicodePerlClassNotFound => { @@ -553,7 +557,7 @@ impl Hir { // We rebuild the alternation by simplifying it. We proceed similarly // as the concatenation case. But in this case, there's no literal // simplification happening. We're just flattening alternations. - let mut new = vec![]; + let mut new = Vec::with_capacity(subs.len()); for sub in subs { let (kind, props) = sub.into_parts(); match kind { @@ -648,6 +652,12 @@ impl Hir { cls.push(ClassBytesRange::new(b'\0', b'\xFF')); Hir::class(Class::Bytes(cls)) } + Dot::AnyCharExcept(ch) => { + let mut cls = + ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); + cls.negate(); + Hir::class(Class::Unicode(cls)) + } Dot::AnyCharExceptLF => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\x09')); @@ -661,6 +671,12 @@ impl Hir { cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } + Dot::AnyByteExcept(byte) => { + let mut cls = + ClassBytes::new([ClassBytesRange::new(byte, byte)]); + cls.negate(); + Hir::class(Class::Bytes(cls)) + } Dot::AnyByteExceptLF => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\x09')); @@ -1772,6 +1788,18 @@ pub enum Dot { /// /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. AnyByte, + /// Matches the UTF-8 encoding of any Unicode scalar value except for the + /// `char` given. + /// + /// This is equivalent to using `(?u-s:.)` with the line terminator set + /// to a particular ASCII byte. (Because of peculiarities in the regex + /// engines, a line terminator must be a single byte. 
It follows that when + /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar + /// value. That is, it must be ASCII.)
if rep.min == 0 && inner.static_explicit_captures_len.map_or(false, |len| len > 0) { diff --git a/regex-syntax/src/hir/print.rs b/regex-syntax/src/hir/print.rs index 44681c65b0..aa737a092d 100644 --- a/regex-syntax/src/hir/print.rs +++ b/regex-syntax/src/hir/print.rs @@ -89,9 +89,16 @@ impl<W: fmt::Write> Visitor for Writer<W> { fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { - // Empty is represented by nothing in the concrete syntax, and - // repetition operators are strictly suffix oriented. - HirKind::Empty | HirKind::Repetition(_) => {} + HirKind::Empty => { + // Technically an empty sub-expression could be "printed" by + // just ignoring it, but in practice, you could have a + // repetition operator attached to an empty expression, and you + // really need something in the concrete syntax to make that + // work as you'd expect. + self.wtr.write_str(r"(?:)")?; + } + // Repetition operators are strictly suffix oriented. + HirKind::Repetition(_) => {} HirKind::Literal(hir::Literal(ref bytes)) => { // See the comment on the 'Concat' and 'Alternation' case below // for why we put parens here. Literals are, conceptually, @@ -424,20 +431,20 @@ mod tests { // Test that various zero-length repetitions always translate to an // empty regex. This is more a property of HIR's smart constructors // than the printer though. 
- roundtrip("a{0}", ""); - roundtrip("(?:ab){0}", ""); + roundtrip("a{0}", "(?:)"); + roundtrip("(?:ab){0}", "(?:)"); #[cfg(feature = "unicode-gencat")] { - roundtrip(r"\p{any}{0}", ""); - roundtrip(r"\P{any}{0}", ""); + roundtrip(r"\p{any}{0}", "(?:)"); + roundtrip(r"\P{any}{0}", "(?:)"); } } #[test] fn print_group() { - roundtrip("()", "()"); - roundtrip("(?P<foo>)", "(?P<foo>)"); - roundtrip("(?:)", ""); + roundtrip("()", "((?:))"); + roundtrip("(?P<foo>)", "(?P<foo>(?:))"); + roundtrip("(?:)", "(?:)"); roundtrip("(a)", "(a)"); roundtrip("(?P<foo>a)", "(?P<foo>a)"); @@ -448,8 +455,8 @@ mod tests { #[test] fn print_alternation() { - roundtrip("|", "(?:|)"); - roundtrip("||", "(?:||)"); + roundtrip("|", "(?:(?:)|(?:))"); + roundtrip("||", "(?:(?:)|(?:)|(?:))"); roundtrip("a|b", "[ab]"); roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); diff --git a/regex-syntax/src/hir/translate.rs b/regex-syntax/src/hir/translate.rs index 6a176f726f..5430b51b27 100644 --- a/regex-syntax/src/hir/translate.rs +++ b/regex-syntax/src/hir/translate.rs @@ -19,6 +19,7 @@ type Result<T> = core::result::Result<T, Error>; #[derive(Clone, Debug)] pub struct TranslatorBuilder { utf8: bool, + line_terminator: u8, flags: Flags, } @@ -31,7 +32,11 @@ impl Default for TranslatorBuilder { impl TranslatorBuilder { /// Create a new translator builder with a default c onfiguration. pub fn new() -> TranslatorBuilder { - TranslatorBuilder { utf8: true, flags: Flags::default() } + TranslatorBuilder { + utf8: true, + line_terminator: b'\n', + flags: Flags::default(), + } } /// Build a translator using the current configuration. @@ -40,6 +45,7 @@ impl TranslatorBuilder { stack: RefCell::new(vec![]), flags: Cell::new(self.flags), utf8: self.utf8, + line_terminator: self.line_terminator, } } @@ -63,6 +69,31 @@ impl TranslatorBuilder { self } + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. 
+ /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { + self.line_terminator = byte; + self + } + /// Enable or disable the case insensitive flag (`i`) by default. pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.case_insensitive = if yes { Some(true) } else { None }; @@ -120,6 +151,8 @@ pub struct Translator { flags: Cell<Flags>, /// Whether we're allowed to produce HIR that can match arbitrary bytes. utf8: bool, + /// The line terminator to use for `.`. 
+ line_terminator: u8, } impl Translator { @@ -862,10 +895,38 @@ impl<'t, 'p> TranslatorI<'t, 'p> { } fn hir_dot(&self, span: Span) -> Result<Hir> { - if !self.flags().unicode() && self.trans().utf8 { + let (utf8, lineterm, flags) = + (self.trans().utf8, self.trans().line_terminator, self.flags()); + if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { return Err(self.error(span, ErrorKind::InvalidUtf8)); } - Ok(Hir::dot(self.flags().dot())) + let dot = if flags.dot_matches_new_line() { + if flags.unicode() { + hir::Dot::AnyChar + } else { + hir::Dot::AnyByte + } + } else { + if flags.unicode() { + if flags.crlf() { + hir::Dot::AnyCharExceptCRLF + } else { + if !lineterm.is_ascii() { + return Err( + self.error(span, ErrorKind::InvalidLineTerminator) + ); + } + hir::Dot::AnyCharExcept(char::from(lineterm)) + } + } else { + if flags.crlf() { + hir::Dot::AnyByteExceptCRLF + } else { + hir::Dot::AnyByteExcept(lineterm) + } + } + }; + Ok(Hir::dot(dot)) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { @@ -1209,30 +1270,6 @@ impl Flags { } } - fn dot(&self) -> hir::Dot { - if self.dot_matches_new_line() { - if self.unicode() { - hir::Dot::AnyChar - } else { - hir::Dot::AnyByte - } - } else { - if self.unicode() { - if self.crlf() { - hir::Dot::AnyCharExceptCRLF - } else { - hir::Dot::AnyCharExceptLF - } - } else { - if self.crlf() { - hir::Dot::AnyByteExceptCRLF - } else { - hir::Dot::AnyByteExceptLF - } - } - } - } - fn case_insensitive(&self) -> bool { self.case_insensitive.unwrap_or(false) } diff --git a/regex-syntax/src/lib.rs b/regex-syntax/src/lib.rs index e029ca1390..47d818a17f 100644 --- a/regex-syntax/src/lib.rs +++ b/regex-syntax/src/lib.rs @@ -168,6 +168,18 @@ The following features are available: #![forbid(unsafe_code)] #![deny(missing_docs, rustdoc::broken_intra_doc_links)] #![warn(missing_debug_implementations)] +// MSRV(1.62): Allow unused warnings. 
Needed for the 'allow' below, +// since the warning is no longer triggered in newer Rust releases. +// Once the 'allow(mutable_borrow_reservation_conflict)' can be +// removed, we can remove the 'allow(renamed_and_removed_lints)' too. +#![allow(renamed_and_removed_lints)] +// MSRV(1.62): This gets triggered on Rust <1.62, and since our MSRV +// is Rust 1.60 at the time of writing, a warning is displayed. But +// the lang team decided the code pattern flagged by this warning is +// OK, so the warning is innocuous. We can remove this explicit allow +// once we get to a Rust release where the warning is no longer +// triggered. I believe that's Rust 1.62. +#![allow(mutable_borrow_reservation_conflict)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #[cfg(any(test, feature = "std"))] diff --git a/regex-syntax/src/parser.rs b/regex-syntax/src/parser.rs index 2e7a2bb80c..f482b84667 100644 --- a/regex-syntax/src/parser.rs +++ b/regex-syntax/src/parser.rs @@ -165,6 +165,31 @@ impl ParserBuilder { self } + /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. + /// + /// Namely, instead of `.` (by default) matching everything except for `\n`, + /// this will cause `.` to match everything except for the byte given. + /// + /// If `.` is used in a context where Unicode mode is enabled and this byte + /// isn't ASCII, then an error will be returned. When Unicode mode is + /// disabled, then any byte is permitted, but will return an error if UTF-8 + /// mode is enabled and it is a non-ASCII byte. + /// + /// In short, any ASCII value for a line terminator is always okay. But a + /// non-ASCII byte might result in an error depending on whether Unicode + /// mode or UTF-8 mode are enabled. + /// + /// Note that if `R` mode is enabled then it always takes precedence and + /// the line terminator will be treated as `\r` and `\n` simultaneously. + /// + /// Note also that this *doesn't* impact the look-around assertions + /// `(?m:^)` and `(?m:$)`. 
That's usually controlled by additional + /// configuration in the regex engine itself. + pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder { + self.hir.line_terminator(byte); + self + } + /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively diff --git a/src/builders.rs b/src/builders.rs new file mode 100644 index 0000000000..285331a825 --- /dev/null +++ b/src/builders.rs @@ -0,0 +1,2525 @@ +#![allow(warnings)] + +// This module defines an internal builder that encapsulates all interaction +// with meta::Regex construction, and then 4 public API builders that wrap +// around it. The docs are essentially repeated on each of the 4 public +// builders, with tweaks to the examples as needed. +// +// The reason why there are so many builders is partially because of a misstep +// in the initial API design: the builder constructor takes in the pattern +// strings instead of using the `build` method to accept the pattern strings. +// This means `new` has a different signature for each builder. It probably +// would have been nicer to use one builder with `fn new()`, and then add +// `build(pat)` and `build_many(pats)` constructors. +// +// The other reason is because I think the `bytes` module should probably +// have its own builder type. That way, it is completely isolated from the +// top-level API. +// +// If I could do it again, I'd probably have a `regex::Builder` and a +// `regex::bytes::Builder`. Each would have `build` and `build_set` (or +// `build_many`) methods for constructing a single pattern `Regex` and a +// multi-pattern `RegexSet`, respectively. + +use alloc::{ + string::{String, ToString}, + sync::Arc, + vec, + vec::Vec, +}; + +use regex_automata::{meta, util::syntax, MatchKind}; + +use crate::error::Error; + +/// A builder for constructing a `Regex`, `bytes::Regex`, `RegexSet` or a +/// `bytes::RegexSet`.
+/// +/// This is essentially the implementation of the four different builder types +/// in the public API: `RegexBuilder`, `bytes::RegexBuilder`, `RegexSetBuilder` +/// and `bytes::RegexSetBuilder`. +#[derive(Clone, Debug)] +struct Builder { + pats: Vec<String>, + metac: meta::Config, + syntaxc: syntax::Config, +} + +impl Default for Builder { + fn default() -> Builder { + let metac = meta::Config::new() + .nfa_size_limit(Some(10 * (1 << 20))) + .hybrid_cache_capacity(2 * (1 << 20)); + Builder { pats: vec![], metac, syntaxc: syntax::Config::default() } + } +} + +impl Builder { + fn new<I, S>(patterns: I) -> Builder + where + S: AsRef<str>, + I: IntoIterator<Item = S>, + { + let mut b = Builder::default(); + b.pats.extend(patterns.into_iter().map(|p| p.as_ref().to_string())); + b + } + + fn build_one_string(&self) -> Result<crate::Regex, Error> { + assert_eq!(1, self.pats.len()); + let metac = self + .metac + .clone() + .match_kind(MatchKind::LeftmostFirst) + .utf8_empty(true); + let syntaxc = self.syntaxc.clone().utf8(true); + let pattern = Arc::from(self.pats[0].as_str()); + meta::Builder::new() + .configure(metac) + .syntax(syntaxc) + .build(&pattern) + .map(|meta| crate::Regex { meta, pattern }) + .map_err(Error::from_meta_build_error) + } + + fn build_one_bytes(&self) -> Result<crate::bytes::Regex, Error> { + assert_eq!(1, self.pats.len()); + let metac = self + .metac + .clone() + .match_kind(MatchKind::LeftmostFirst) + .utf8_empty(false); + let syntaxc = self.syntaxc.clone().utf8(false); + let pattern = Arc::from(self.pats[0].as_str()); + meta::Builder::new() + .configure(metac) + .syntax(syntaxc) + .build(&pattern) + .map(|meta| crate::bytes::Regex { meta, pattern }) + .map_err(Error::from_meta_build_error) + } + + fn build_many_string(&self) -> Result<crate::RegexSet, Error> { + let metac = + self.metac.clone().match_kind(MatchKind::All).utf8_empty(true); + let syntaxc = self.syntaxc.clone().utf8(true); + let patterns = Arc::from(self.pats.as_slice()); + 
meta::Builder::new() + .configure(metac) + .syntax(syntaxc) + .build_many(&patterns) + .map(|meta| crate::RegexSet { meta, patterns }) + .map_err(Error::from_meta_build_error) + } + + fn build_many_bytes(&self) -> Result<crate::bytes::RegexSet, Error> { + let metac = + self.metac.clone().match_kind(MatchKind::All).utf8_empty(false); + let syntaxc = self.syntaxc.clone().utf8(false); + let patterns = Arc::from(self.pats.as_slice()); + meta::Builder::new() + .configure(metac) + .syntax(syntaxc) + .build_many(&patterns) + .map(|meta| crate::bytes::RegexSet { meta, patterns }) + .map_err(Error::from_meta_build_error) + } + + fn case_insensitive(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.case_insensitive(yes); + self + } + + fn multi_line(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.multi_line(yes); + self + } + + fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.dot_matches_new_line(yes); + self + } + + fn crlf(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.crlf(yes); + self + } + + fn line_terminator(&mut self, byte: u8) -> &mut Builder { + self.metac = self.metac.clone().line_terminator(byte); + self.syntaxc = self.syntaxc.line_terminator(byte); + self + } + + fn swap_greed(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.swap_greed(yes); + self + } + + fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.ignore_whitespace(yes); + self + } + + fn unicode(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.unicode(yes); + self + } + + fn octal(&mut self, yes: bool) -> &mut Builder { + self.syntaxc = self.syntaxc.octal(yes); + self + } + + fn size_limit(&mut self, limit: usize) -> &mut Builder { + self.metac = self.metac.clone().nfa_size_limit(Some(limit)); + self + } + + fn dfa_size_limit(&mut self, limit: usize) -> &mut Builder { + self.metac = 
self.metac.clone().hybrid_cache_capacity(limit); + self + } + + fn nest_limit(&mut self, limit: u32) -> &mut Builder { + self.syntaxc = self.syntaxc.nest_limit(limit); + self + } +} + +pub(crate) mod string { + use crate::{error::Error, Regex, RegexSet}; + + use super::Builder; + + /// A configurable builder for a [`Regex`]. + /// + /// This builder can be used to programmatically set flags such as `i` + /// (case insensitive) and `x` (for verbose mode). This builder can also be + /// used to configure things like the line terminator and a size limit on + /// the compiled regular expression. + #[derive(Clone, Debug)] + pub struct RegexBuilder { + builder: Builder, + } + + impl RegexBuilder { + /// Create a new builder with a default configuration for the given + /// pattern. + /// + /// If the pattern is invalid or exceeds the configured size limits, + /// then an error will be returned when [`RegexBuilder::build`] is + /// called. + pub fn new(pattern: &str) -> RegexBuilder { + RegexBuilder { builder: Builder::new([pattern]) } + } + + /// Compiles the pattern given to `RegexBuilder::new` with the + /// configuration set on this builder. + /// + /// If the pattern isn't a valid regex or if a configured size limit + /// was exceeded, then an error is returned. + pub fn build(&self) -> Result<Regex, Error> { + self.builder.build_one_string() + } + + /// This configures Unicode mode for the entire pattern. + /// + /// Enabling Unicode mode does a number of things: + /// + /// * Most fundamentally, it causes the fundamental atom of matching + /// to be a single codepoint. When Unicode mode is disabled, it's a + /// single byte. For example, when Unicode mode is enabled, `.` will + /// match `💩` once, where as it will match 4 times when Unicode mode + /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) + /// * Case insensitive matching uses Unicode simple case folding rules. 
+ /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are + /// available. + /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and + /// `\d`. + /// * The word boundary assertions, `\b` and `\B`, use the Unicode + /// definition of a word character. + /// + /// Note that if Unicode mode is disabled, then the regex will fail to + /// compile if it could match invalid UTF-8. For example, when Unicode + /// mode is disabled, then since `.` matches any byte (except for + /// `\n`), then it can match invalid UTF-8 and thus building a regex + /// from it will fail. Another example is `\w` and `\W`. Since `\w` can + /// only match ASCII bytes when Unicode mode is disabled, it's allowed. + /// But `\W` can match more than ASCII bytes, including invalid UTF-8, + /// and so it is not allowed. This restriction can be lifted only by + /// using a [`bytes::Regex`](crate::bytes::Regex). + /// + /// For more details on the Unicode support in this crate, see the + /// [Unicode section](crate#unicode) in this crate's top-level + /// documentation. + /// + /// The default for this is `true`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"\w") + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally greek letters would be included in \w, but since + /// // Unicode mode is disabled, it only matches ASCII letters. + /// assert!(!re.is_match("δ")); + /// + /// let re = RegexBuilder::new(r"s") + /// .case_insensitive(true) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally 'ſ' is included when searching for 's' case + /// // insensitively due to Unicode's simple case folding rules. But + /// // when Unicode mode is disabled, only ASCII case insensitive rules + /// // are used. 
+ /// assert!(!re.is_match("ſ")); + /// ``` + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.unicode(yes); + self + } + + /// This configures whether to enable case insensitive matching for the + /// entire pattern. + /// + /// This setting can also be configured using the inline flag `i` + /// in the pattern. For example, `(?i:foo)` matches `foo` case + /// insensitively while `(?-i:foo)` matches `foo` case sensitively. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") + /// .case_insensitive(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match("FoObarQuUx")); + /// // Even though case insensitive matching is enabled in the builder, + /// // it can be locally disabled within the pattern. In this case, + /// // `bar` is matched case sensitively. + /// assert!(!re.is_match("fooBARquux")); + /// ``` + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.case_insensitive(yes); + self + } + + /// This configures multi-line mode for the entire pattern. + /// + /// Enabling multi-line mode changes the behavior of the `^` and `$` + /// anchor assertions. Instead of only matching at the beginning and + /// end of a haystack, respectively, multi-line mode causes them to + /// match at the beginning and end of a line *in addition* to the + /// beginning and end of a haystack. More precisely, `^` will match at + /// the position immediately following a `\n` and `$` will match at the + /// position immediately preceding a `\n`. + /// + /// The behavior of this option can be impacted by other settings too: + /// + /// * The [`RegexBuilder::line_terminator`] option changes `\n` above + /// to any ASCII byte. + /// * The [`RegexBuilder::crlf`] option changes the line terminator to + /// be either `\r` or `\n`, but never at the position between a `\r` + /// and `\n`. 
+ /// + /// This setting can also be configured using the inline flag `m` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .build() + /// .unwrap(); + /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range())); + /// ``` + pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.multi_line(yes); + self + } + + /// This configures dot-matches-new-line mode for the entire pattern. + /// + /// Perhaps surprisingly, the default behavior for `.` is not to match + /// any character, but rather, to match any character except for the + /// line terminator (which is `\n` by default). When this mode is + /// enabled, the behavior changes such that `.` truly matches any + /// character. + /// + /// This setting can also be configured using the inline flag `s` in + /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent + /// regexes. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"foo.bar") + /// .dot_matches_new_line(true) + /// .build() + /// .unwrap(); + /// let hay = "foo\nbar"; + /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str())); + /// ``` + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexBuilder { + self.builder.dot_matches_new_line(yes); + self + } + + /// This configures CRLF mode for the entire pattern. + /// + /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for + /// short) and `\n` ("line feed" or LF for short) are treated as line + /// terminators. This results in the following: + /// + /// * Unless dot-matches-new-line mode is enabled, `.` will now match + /// any character except for `\n` and `\r`. 
+ /// * When multi-line mode is enabled, `^` will match immediately + /// following a `\n` or a `\r`. Similarly, `$` will match immediately + /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match + /// between `\r` and `\n`. + /// + /// This setting can also be configured using the inline flag `R` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = "\r\nfoo\r\n"; + /// // If CRLF mode weren't enabled here, then '$' wouldn't match + /// // immediately after 'foo', and thus no match would be found. + /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str())); + /// ``` + /// + /// This example demonstrates that `^` will never match at a position + /// between `\r` and `\n`. (`$` will similarly not match between a `\r` + /// and a `\n`.) + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^") + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = "\r\n\r\n"; + /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); + /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); + /// ``` + pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.crlf(yes); + self + } + + /// Configures the line terminator to be used by the regex. + /// + /// The line terminator is relevant in two ways for a particular regex: + /// + /// * When dot-matches-new-line mode is *not* enabled (the default), + /// then `.` will match any character except for the configured line + /// terminator. + /// * When multi-line mode is enabled (not the default), then `^` and + /// `$` will match immediately after and before, respectively, a line + /// terminator.
+ /// + /// In both cases, if CRLF mode is enabled in a particular context, + /// then it takes precedence over any configured line terminator. + /// + /// This option cannot be configured from within the pattern. + /// + /// The default line terminator is `\n`. + /// + /// # Example + /// + /// This shows how to treat the NUL byte as a line terminator. This can + /// be a useful heuristic when searching binary data. + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// let hay = "\x00foo\x00"; + /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); + /// ``` + /// + /// This example shows that the behavior of `.` is impacted by this + /// setting as well: + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r".") + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// assert!(re.is_match("\n")); + /// assert!(!re.is_match("\x00")); + /// ``` + /// + /// This shows that building a regex will fail if the byte given + /// is not ASCII and the pattern could result in matching invalid + /// UTF-8. This is because any singular non-ASCII byte is not valid + /// UTF-8, and it is not permitted for a [`Regex`] to match invalid + /// UTF-8. (It is permissible to use a non-ASCII byte when building a + /// [`bytes::Regex`](crate::bytes::Regex).) + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// assert!(RegexBuilder::new(r".").line_terminator(0x80).build().is_err()); + /// // Note that using a non-ASCII byte isn't enough on its own to + /// // cause regex compilation to fail. You actually have to make use + /// // of it in the regex in a way that leads to matching invalid + /// // UTF-8. If you don't, then regex compilation will succeed! 
+ /// assert!(RegexBuilder::new(r"a").line_terminator(0x80).build().is_ok()); + /// ``` + pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { + self.builder.line_terminator(byte); + self + } + + /// This configures swap-greed mode for the entire pattern. + /// + /// When swap-greed mode is enabled, patterns like `a+` will become + /// non-greedy and patterns like `a+?` will become greedy. In other + /// words, the meanings of `a+` and `a+?` are switched. + /// + /// This setting can also be configured using the inline flag `U` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"a+") + /// .swap_greed(true) + /// .build() + /// .unwrap(); + /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str())); + /// ``` + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.swap_greed(yes); + self + } + + /// This configures verbose mode for the entire pattern. + /// + /// When enabled, whitespace will be treated as insignificant in the + /// pattern and `#` can be used to start a comment until the next new + /// line. + /// + /// Normally, in most places in a pattern, whitespace is treated + /// literally. For example ` +` will match one or more ASCII whitespace + /// characters. + /// + /// When verbose mode is enabled, `\#` can be used to match a literal + /// `#` and `\ ` can be used to match a literal ASCII whitespace + /// character. + /// + /// Verbose mode is useful for permitting regexes to be formatted and + /// broken up more nicely. This may make them more easily readable. + /// + /// This setting can also be configured using the inline flag `x` in + /// the pattern. + /// + /// The default for this is `false`.
+ /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// let pat = r" + /// \b + /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter + /// [\s--\n]+ # whitespace should separate names + /// (?: # middle name can be an initial! + /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) + /// [\s--\n]+ + /// )? + /// (?<last>\p{Uppercase}\w*) + /// \b + /// "; + /// let re = RegexBuilder::new(pat) + /// .ignore_whitespace(true) + /// .build() + /// .unwrap(); + /// + /// let caps = re.captures("Harry Potter").unwrap(); + /// assert_eq!("Harry", &caps["first"]); + /// assert_eq!("Potter", &caps["last"]); + /// + /// let caps = re.captures("Harry J. Potter").unwrap(); + /// assert_eq!("Harry", &caps["first"]); + /// // Since a middle name/initial isn't required for an overall match, + /// // we can't assume that 'initial' or 'middle' will be populated! + /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str())); + /// assert_eq!(None, caps.name("middle").map(|m| m.as_str())); + /// assert_eq!("Potter", &caps["last"]); + /// + /// let caps = re.captures("Harry James Potter").unwrap(); + /// assert_eq!("Harry", &caps["first"]); + /// // Since a middle name/initial isn't required for an overall match, + /// // we can't assume that 'initial' or 'middle' will be populated! + /// assert_eq!(None, caps.name("initial").map(|m| m.as_str())); + /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str())); + /// assert_eq!("Potter", &caps["last"]); + /// ``` + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.ignore_whitespace(yes); + self + } + + /// This configures octal mode for the entire pattern. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints + /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all + /// equivalent patterns, where the last example shows octal syntax. 
+ /// + /// While supporting octal syntax isn't in and of itself a problem, + /// it does make good error messages harder. That is, in PCRE based + /// regex engines, syntax like `\1` invokes a backreference, which is + /// explicitly unsupported by this library. However, many users expect + /// backreferences to be supported. Therefore, when octal support + /// is disabled, the error message will explicitly mention that + /// backreferences aren't supported. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// // Normally this pattern would not compile, with an error message + /// // about backreferences not being supported. But with octal mode + /// // enabled, octal escape sequences work. + /// let re = RegexBuilder::new(r"\141") + /// .octal(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match("a")); + /// ``` + pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.octal(yes); + self + } + + /// Sets the approximate size limit, in bytes, of the compiled regex. + /// + /// This roughly corresponds to the amount of heap memory, in + /// bytes, occupied by a single regex. If the regex would otherwise + /// approximately exceed this limit, then compiling that regex will + /// fail. + /// + /// The main utility of a method like this is to avoid compiling + /// regexes that use an unexpected amount of resources, such as + /// time and memory. Even if the memory usage of a large regex is + /// acceptable, its search time may not be. Namely, worst case time + /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and + /// `n ~ len(haystack)`. That is, search time depends, in part, on the + /// size of the compiled regex. This means that putting a limit on the + /// size of the regex limits how much a regex can impact search time.
+ /// + /// For more information about regex size limits, see the section on + /// [untrusted inputs](crate#untrusted-input) in the top-level crate + /// documentation. + /// + /// The default for this is some reasonable number that permits most + /// patterns to compile successfully. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// // It may surprise you how big some seemingly small patterns can + /// // be! Since \w is Unicode aware, this generates a regex that can + /// // match approximately 140,000 distinct codepoints. + /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err()); + /// ``` + pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { + self.builder.size_limit(bytes); + self + } + + /// Set the approximate capacity, in bytes, of the cache of transitions + /// used by the lazy DFA. + /// + /// While the lazy DFA isn't always used, it tends to be the most + /// commonly used regex engine in default configurations. It tends to + /// adopt the performance profile of a fully built DFA, but without the + /// downside of taking worst case exponential time to build. + /// + /// The downside is that it needs to keep a cache of transitions and + /// states that are built while running a search, and this cache + /// can fill up. When it fills up, the cache will reset itself. Any + /// previously generated states and transitions will then need to be + /// re-generated. If this happens too many times, then this library + /// will bail out of using the lazy DFA and switch to a different regex + /// engine. + /// + /// If your regex provokes this particular downside of the lazy DFA, + /// then it may be beneficial to increase its cache capacity. This will + /// potentially reduce the frequency of cache resetting (ideally to + /// `0`). While it won't fix all potential performance problems with + /// the lazy DFA, increasing the cache capacity does fix some.
+ /// + /// There is no easy way to determine, a priori, whether increasing + /// this cache capacity will help. In general, the larger your regex, + /// the more cache it's likely to use. But that isn't an ironclad rule. + /// For example, a regex like `[01]*1[01]{N}` would normally produce a + /// fully built DFA that is exponential in size with respect to `N`. + /// The lazy DFA will prevent exponential space blow-up, but its cache + /// is likely to fill up, even when it's large and even for smallish + /// values of `N`. + /// + /// If you aren't sure whether this helps or not, it is sensible to + /// set this to some arbitrarily large number in testing, such as + /// `usize::MAX`. Namely, this represents the amount of capacity that + /// *may* be used. It's probably not a good idea to use `usize::MAX` in + /// production though, since it implies there are no controls on heap + /// memory used by this library during a search. In effect, set it to + /// whatever you're willing to allocate for a single regex search. + pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { + self.builder.dfa_size_limit(bytes); + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is + /// allowed to be. If the AST exceeds the given limit (e.g., with too + /// many nested groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an AST using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire AST is parsed. + /// Therefore, if callers want to put a limit on the amount of heap + /// space used, then they should impose a limit on the length, in + /// bytes, of the concrete pattern string.
In particular, this is + /// viable since this parser implementation will limit itself to heap + /// space proportional to the length of the pattern string. See also + /// the [untrusted inputs](crate#untrusted-input) section in the + /// top-level crate documentation for more information about this. + /// + /// Note that a nest limit of `0` will return a nest limit error for + /// most patterns but not all. For example, a nest limit of `0` permits + /// `a` but not `ab`, since `ab` requires an explicit concatenation, + /// which results in a nest depth of `1`. In general, a nest limit is + /// not something that manifests in an obvious way in the concrete + /// syntax, therefore, it should not be used in a granular way. + /// + /// # Example + /// + /// ``` + /// use regex::RegexBuilder; + /// + /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); + /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err()); + /// ``` + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.builder.nest_limit(limit); + self + } + } + + /// A configurable builder for a [`RegexSet`]. + /// + /// This builder can be used to programmatically set flags such as + /// `i` (case insensitive) and `x` (for verbose mode). This builder + /// can also be used to configure things like the line terminator + /// and a size limit on the compiled regular expression. + #[derive(Clone, Debug)] + pub struct RegexSetBuilder { + builder: Builder, + } + + impl RegexSetBuilder { + /// Create a new builder with a default configuration for the given + /// patterns. + /// + /// If the patterns are invalid or exceed the configured size limits, + /// then an error will be returned when [`RegexSetBuilder::build`] is + /// called. 
+ pub fn new<I, S>(patterns: I) -> RegexSetBuilder
+ where
+ I: IntoIterator<Item = S>,
+ S: AsRef<str>,
+ {
+ RegexSetBuilder { builder: Builder::new(patterns) }
+ }
+
+ /// Compiles the patterns given to `RegexSetBuilder::new` with the
+ /// configuration set on this builder.
+ ///
+ /// If the patterns aren't valid regexes or if a configured size limit
+ /// was exceeded, then an error is returned.
+ pub fn build(&self) -> Result<RegexSet, Error> {
+ self.builder.build_many_string()
+ }
+
+ /// This configures Unicode mode for all of the patterns.
+ ///
+ /// Enabling Unicode mode does a number of things:
+ ///
+ /// * Most fundamentally, it causes the fundamental atom of matching
+ /// to be a single codepoint. When Unicode mode is disabled, it's a
+ /// single byte. For example, when Unicode mode is enabled, `.` will
+ /// match `💩` once, where as it will match 4 times when Unicode mode
+ /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.)
+ /// * Case insensitive matching uses Unicode simple case folding rules.
+ /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are
+ /// available.
+ /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and
+ /// `\d`.
+ /// * The word boundary assertions, `\b` and `\B`, use the Unicode
+ /// definition of a word character.
+ ///
+ /// Note that if Unicode mode is disabled, then the regex will fail to
+ /// compile if it could match invalid UTF-8. For example, when Unicode
+ /// mode is disabled, then since `.` matches any byte (except for
+ /// `\n`), then it can match invalid UTF-8 and thus building a regex
+ /// from it will fail. Another example is `\w` and `\W`. Since `\w` can
+ /// only match ASCII bytes when Unicode mode is disabled, it's allowed.
+ /// But `\W` can match more than ASCII bytes, including invalid UTF-8,
+ /// and so it is not allowed. This restriction can be lifted only by
+ /// using a [`bytes::RegexSet`](crate::bytes::RegexSet). 
+ /// + /// For more details on the Unicode support in this crate, see the + /// [Unicode section](crate#unicode) in this crate's top-level + /// documentation. + /// + /// The default for this is `true`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"\w"]) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally greek letters would be included in \w, but since + /// // Unicode mode is disabled, it only matches ASCII letters. + /// assert!(!re.is_match("δ")); + /// + /// let re = RegexSetBuilder::new([r"s"]) + /// .case_insensitive(true) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally 'ſ' is included when searching for 's' case + /// // insensitively due to Unicode's simple case folding rules. But + /// // when Unicode mode is disabled, only ASCII case insensitive rules + /// // are used. + /// assert!(!re.is_match("ſ")); + /// ``` + pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.unicode(yes); + self + } + + /// This configures whether to enable case insensitive matching for all + /// of the patterns. + /// + /// This setting can also be configured using the inline flag `i` + /// in the pattern. For example, `(?i:foo)` matches `foo` case + /// insensitively while `(?-i:foo)` matches `foo` case sensitively. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"]) + /// .case_insensitive(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match("FoObarQuUx")); + /// // Even though case insensitive matching is enabled in the builder, + /// // it can be locally disabled within the pattern. In this case, + /// // `bar` is matched case sensitively. 
+ /// assert!(!re.is_match("fooBARquux")); + /// ``` + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.case_insensitive(yes); + self + } + + /// This configures multi-line mode for all of the patterns. + /// + /// Enabling multi-line mode changes the behavior of the `^` and `$` + /// anchor assertions. Instead of only matching at the beginning and + /// end of a haystack, respectively, multi-line mode causes them to + /// match at the beginning and end of a line *in addition* to the + /// beginning and end of a haystack. More precisely, `^` will match at + /// the position immediately following a `\n` and `$` will match at the + /// position immediately preceding a `\n`. + /// + /// The behavior of this option can be impacted by other settings too: + /// + /// * The [`RegexSetBuilder::line_terminator`] option changes `\n` + /// above to any ASCII byte. + /// * The [`RegexSetBuilder::crlf`] option changes the line terminator + /// to be either `\r` or `\n`, but never at the position between a `\r` + /// and `\n`. + /// + /// This setting can also be configured using the inline flag `m` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^foo$"]) + /// .multi_line(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match("\nfoo\n")); + /// ``` + pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.multi_line(yes); + self + } + + /// This configures dot-matches-new-line mode for the entire pattern. + /// + /// Perhaps surprisingly, the default behavior for `.` is not to match + /// any character, but rather, to match any character except for the + /// line terminator (which is `\n` by default). When this mode is + /// enabled, the behavior changes such that `.` truly matches any + /// character. 
+ ///
+ /// This setting can also be configured using the inline flag `s` in
+ /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
+ /// regexes.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"foo.bar"])
+ /// .dot_matches_new_line(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "foo\nbar";
+ /// assert!(re.is_match(hay));
+ /// ```
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.builder.dot_matches_new_line(yes);
+ self
+ }
+
+ /// This configures CRLF mode for all of the patterns.
+ ///
+ /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
+ /// short) and `\n` ("line feed" or LF for short) are treated as line
+ /// terminators. This results in the following:
+ ///
+ /// * Unless dot-matches-new-line mode is enabled, `.` will now match
+ /// any character except for `\n` and `\r`.
+ /// * When multi-line mode is enabled, `^` will match immediately
+ /// following a `\n` or a `\r`. Similarly, `$` will match immediately
+ /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match
+ /// between `\r` and `\n`.
+ ///
+ /// This setting can also be configured using the inline flag `R` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let re = RegexSetBuilder::new([r"^foo$"])
+ /// .multi_line(true)
+ /// .crlf(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = "\r\nfoo\r\n";
+ /// // If CRLF mode weren't enabled here, then '$' wouldn't match
+ /// // immediately after 'foo', and thus no match would be found.
+ /// assert!(re.is_match(hay));
+ /// ```
+ ///
+ /// This example demonstrates that `^` will never match at a position
+ /// between `\r` and `\n`. (`$` will similarly not match between a `\r`
+ /// and a `\n`.) 
+ /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^\n"]) + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// assert!(!re.is_match("\r\n")); + /// ``` + pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.crlf(yes); + self + } + + /// Configures the line terminator to be used by the regex. + /// + /// The line terminator is relevant in two ways for a particular regex: + /// + /// * When dot-matches-new-line mode is *not* enabled (the default), + /// then `.` will match any character except for the configured line + /// terminator. + /// * When multi-line mode is enabled (not the default), then `^` and + /// `$` will match immediately after and before, respectively, a line + /// terminator. + /// + /// In both cases, if CRLF mode is enabled in a particular context, + /// then it takes precedence over any configured line terminator. + /// + /// This option cannot be configured from within the pattern. + /// + /// The default line terminator is `\n`. + /// + /// # Example + /// + /// This shows how to treat the NUL byte as a line terminator. This can + /// be a useful heuristic when searching binary data. + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^foo$"]) + /// .multi_line(true) + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// let hay = "\x00foo\x00"; + /// assert!(re.is_match(hay)); + /// ``` + /// + /// This example shows that the behavior of `.` is impacted by this + /// setting as well: + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"."]) + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// assert!(re.is_match("\n")); + /// assert!(!re.is_match("\x00")); + /// ``` + /// + /// This shows that building a regex will fail if the byte given + /// is not ASCII and the pattern could result in matching invalid + /// UTF-8. 
This is because any singular non-ASCII byte is not valid + /// UTF-8, and it is not permitted for a [`RegexSet`] to match invalid + /// UTF-8. (It is permissible to use a non-ASCII byte when building a + /// [`bytes::RegexSet`](crate::bytes::RegexSet).) + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// assert!( + /// RegexSetBuilder::new([r"."]) + /// .line_terminator(0x80) + /// .build() + /// .is_err() + /// ); + /// // Note that using a non-ASCII byte isn't enough on its own to + /// // cause regex compilation to fail. You actually have to make use + /// // of it in the regex in a way that leads to matching invalid + /// // UTF-8. If you don't, then regex compilation will succeed! + /// assert!( + /// RegexSetBuilder::new([r"a"]) + /// .line_terminator(0x80) + /// .build() + /// .is_ok() + /// ); + /// ``` + pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder { + self.builder.line_terminator(byte); + self + } + + /// This configures swap-greed mode for all of the patterns. + /// + /// When swap-greed mode is enabled, patterns like `a+` will become + /// non-greedy and patterns like `a+?` will become greedy. In other + /// words, the meanings of `a+` and `a+?` are switched. + /// + /// This setting can also be configured using the inline flag `U` in + /// the pattern. + /// + /// Note that this is generally not useful for a `RegexSet` since a + /// `RegexSet` can only report whether a pattern matches or not. Since + /// greediness never impacts whether a match is found or not (only the + /// offsets of the match), it follows that whether parts of a pattern + /// are greedy or not doesn't matter for a `RegexSet`. + /// + /// The default for this is `false`. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.swap_greed(yes); + self + } + + /// This configures verbose mode for all of the patterns. 
+ ///
+ /// When enabled, whitespace will be treated as insignificant in the
+ /// pattern and `#` can be used to start a comment until the next new
+ /// line.
+ ///
+ /// Normally, in most places in a pattern, whitespace is treated
+ /// literally. For example ` +` will match one or more ASCII whitespace
+ /// characters.
+ ///
+ /// When verbose mode is enabled, `\#` can be used to match a literal
+ /// `#` and `\ ` can be used to match a literal ASCII whitespace
+ /// character.
+ ///
+ /// Verbose mode is useful for permitting regexes to be formatted and
+ /// broken up more nicely. This may make them more easily readable.
+ ///
+ /// This setting can also be configured using the inline flag `x` in
+ /// the pattern.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// let pat = r"
+ /// \b
+ /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter
+ /// [\s--\n]+ # whitespace should separate names
+ /// (?: # middle name can be an initial!
+ /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*))
+ /// [\s--\n]+
+ /// )?
+ /// (?<last>\p{Uppercase}\w*)
+ /// \b
+ /// ";
+ /// let re = RegexSetBuilder::new([pat])
+ /// .ignore_whitespace(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("Harry Potter"));
+ /// assert!(re.is_match("Harry J. Potter"));
+ /// assert!(re.is_match("Harry James Potter"));
+ /// assert!(!re.is_match("harry J. Potter"));
+ /// ```
+ pub fn ignore_whitespace(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexSetBuilder {
+ self.builder.ignore_whitespace(yes);
+ self
+ }
+
+ /// This configures octal mode for all of the patterns.
+ ///
+ /// Octal syntax is a little-known way of uttering Unicode codepoints
+ /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all
+ /// equivalent patterns, where the last example shows octal syntax. 
+ ///
+ /// While supporting octal syntax isn't in and of itself a problem,
+ /// it does make good error messages harder. That is, in PCRE based
+ /// regex engines, syntax like `\1` invokes a backreference, which is
+ /// explicitly unsupported in this library. However, many users expect
+ /// backreferences to be supported. Therefore, when octal support
+ /// is disabled, the error message will explicitly mention that
+ /// backreferences aren't supported.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// // Normally this pattern would not compile, with an error message
+ /// // about backreferences not being supported. But with octal mode
+ /// // enabled, octal escape sequences work.
+ /// let re = RegexSetBuilder::new([r"\141"])
+ /// .octal(true)
+ /// .build()
+ /// .unwrap();
+ /// assert!(re.is_match("a"));
+ /// ```
+ pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
+ self.builder.octal(yes);
+ self
+ }
+
+ /// Sets the approximate size limit, in bytes, of the compiled regex.
+ ///
+ /// This roughly corresponds to the amount of heap memory, in
+ /// bytes, occupied by a single regex. If the regex would otherwise
+ /// approximately exceed this limit, then compiling that regex will
+ /// fail.
+ ///
+ /// The main utility of a method like this is to avoid compiling
+ /// regexes that use an unexpected amount of resources, such as
+ /// time and memory. Even if the memory usage of a large regex is
+ /// acceptable, its search time may not be. Namely, worst case time
+ /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and
+ /// `n ~ len(haystack)`. That is, search time depends, in part, on the
+ /// size of the compiled regex. This means that putting a limit on the
+ /// size of the regex limits how much a regex can impact search time. 
+ ///
+ /// For more information about regex size limits, see the section on
+ /// [untrusted inputs](crate#untrusted-input) in the top-level crate
+ /// documentation.
+ ///
+ /// The default for this is some reasonable number that permits most
+ /// patterns to compile successfully.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSetBuilder;
+ ///
+ /// // It may surprise you how big some seemingly small patterns can
+ /// // be! Since \w is Unicode aware, this generates a regex that can
+ /// // match approximately 140,000 distinct codepoints.
+ /// assert!(
+ /// RegexSetBuilder::new([r"\w"])
+ /// .size_limit(45_000)
+ /// .build()
+ /// .is_err()
+ /// );
+ /// ```
+ pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder {
+ self.builder.size_limit(bytes);
+ self
+ }
+
+ /// Set the approximate capacity, in bytes, of the cache of transitions
+ /// used by the lazy DFA.
+ ///
+ /// While the lazy DFA isn't always used, it tends to be the most
+ /// commonly used regex engine in default configurations. It tends to
+ /// adopt the performance profile of a fully built DFA, but without the
+ /// downside of taking worst case exponential time to build.
+ ///
+ /// The downside is that it needs to keep a cache of transitions and
+ /// states that are built while running a search, and this cache
+ /// can fill up. When it fills up, the cache will reset itself. Any
+ /// previously generated states and transitions will then need to be
+ /// re-generated. If this happens too many times, then this library
+ /// will bail out of using the lazy DFA and switch to a different regex
+ /// engine.
+ ///
+ /// If your regex provokes this particular downside of the lazy DFA,
+ /// then it may be beneficial to increase its cache capacity. This will
+ /// potentially reduce the frequency of cache resetting (ideally to
+ /// `0`). While it won't fix all potential performance problems with
+ /// the lazy DFA, increasing the cache capacity does fix some. 
+ ///
+ /// There is no easy way to determine, a priori, whether increasing
+ /// this cache capacity will help. In general, the larger your regex,
+ /// the more cache it's likely to use. But that isn't an ironclad rule.
+ /// For example, a regex like `[01]*1[01]{N}` would normally produce a
+ /// fully built DFA that is exponential in size with respect to `N`.
+ /// The lazy DFA will prevent exponential space blow-up, but its cache
+ /// is likely to fill up, even when it's large and even for smallish
+ /// values of `N`.
+ ///
+ /// If you aren't sure whether this helps or not, it is sensible to
+ /// set this to some arbitrarily large number in testing, such as
+ /// `usize::MAX`. Namely, this represents the amount of capacity that
+ /// *may* be used. It's probably not a good idea to use `usize::MAX` in
+ /// production though, since it implies there are no controls on heap
+ /// memory used by this library during a search. In effect, set it to
+ /// whatever you're willing to allocate for a single regex search.
+ pub fn dfa_size_limit(
+ &mut self,
+ bytes: usize,
+ ) -> &mut RegexSetBuilder {
+ self.builder.dfa_size_limit(bytes);
+ self
+ }
+
+ /// Set the nesting limit for this parser.
+ ///
+ /// The nesting limit controls how deep the abstract syntax tree is
+ /// allowed to be. If the AST exceeds the given limit (e.g., with too
+ /// many nested groups), then an error is returned by the parser.
+ ///
+ /// The purpose of this limit is to act as a heuristic to prevent stack
+ /// overflow for consumers that do structural induction on an AST using
+ /// explicit recursion. While this crate never does this (instead using
+ /// constant stack space and moving the call stack to the heap), other
+ /// crates may.
+ ///
+ /// This limit is not checked until the entire AST is parsed. 
+ /// Therefore, if callers want to put a limit on the amount of heap + /// space used, then they should impose a limit on the length, in + /// bytes, of the concrete pattern string. In particular, this is + /// viable since this parser implementation will limit itself to heap + /// space proportional to the length of the pattern string. See also + /// the [untrusted inputs](crate#untrusted-input) section in the + /// top-level crate documentation for more information about this. + /// + /// Note that a nest limit of `0` will return a nest limit error for + /// most patterns but not all. For example, a nest limit of `0` permits + /// `a` but not `ab`, since `ab` requires an explicit concatenation, + /// which results in a nest depth of `1`. In general, a nest limit is + /// not something that manifests in an obvious way in the concrete + /// syntax, therefore, it should not be used in a granular way. + /// + /// # Example + /// + /// ``` + /// use regex::RegexSetBuilder; + /// + /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok()); + /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err()); + /// ``` + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { + self.builder.nest_limit(limit); + self + } + } +} + +pub(crate) mod bytes { + use crate::{ + bytes::{Regex, RegexSet}, + error::Error, + }; + + use super::Builder; + + /// A configurable builder for a [`Regex`]. + /// + /// This builder can be used to programmatically set flags such as `i` + /// (case insensitive) and `x` (for verbose mode). This builder can also be + /// used to configure things like the line terminator and a size limit on + /// the compiled regular expression. + #[derive(Clone, Debug)] + pub struct RegexBuilder { + builder: Builder, + } + + impl RegexBuilder { + /// Create a new builder with a default configuration for the given + /// pattern. 
+ /// + /// If the pattern is invalid or exceeds the configured size limits, + /// then an error will be returned when [`RegexBuilder::build`] is + /// called. + pub fn new(pattern: &str) -> RegexBuilder { + RegexBuilder { builder: Builder::new([pattern]) } + } + + /// Compiles the pattern given to `RegexBuilder::new` with the + /// configuration set on this builder. + /// + /// If the pattern isn't a valid regex or if a configured size limit + /// was exceeded, then an error is returned. + pub fn build(&self) -> Result<Regex, Error> { + self.builder.build_one_bytes() + } + + /// This configures Unicode mode for the entire pattern. + /// + /// Enabling Unicode mode does a number of things: + /// + /// * Most fundamentally, it causes the fundamental atom of matching + /// to be a single codepoint. When Unicode mode is disabled, it's a + /// single byte. For example, when Unicode mode is enabled, `.` will + /// match `💩` once, where as it will match 4 times when Unicode mode + /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) + /// * Case insensitive matching uses Unicode simple case folding rules. + /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are + /// available. + /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and + /// `\d`. + /// * The word boundary assertions, `\b` and `\B`, use the Unicode + /// definition of a word character. + /// + /// Note that unlike the top-level `Regex` for searching `&str`, it + /// is permitted to disable Unicode mode even if the resulting pattern + /// could match invalid UTF-8. For example, `(?-u:.)` is not a valid + /// pattern for a top-level `Regex`, but is valid for a `bytes::Regex`. + /// + /// For more details on the Unicode support in this crate, see the + /// [Unicode section](crate#unicode) in this crate's top-level + /// documentation. + /// + /// The default for this is `true`. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"\w") + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally greek letters would be included in \w, but since + /// // Unicode mode is disabled, it only matches ASCII letters. + /// assert!(!re.is_match("δ".as_bytes())); + /// + /// let re = RegexBuilder::new(r"s") + /// .case_insensitive(true) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally 'ſ' is included when searching for 's' case + /// // insensitively due to Unicode's simple case folding rules. But + /// // when Unicode mode is disabled, only ASCII case insensitive rules + /// // are used. + /// assert!(!re.is_match("ſ".as_bytes())); + /// ``` + /// + /// Since this builder is for constructing a [`bytes::Regex`](Regex), + /// one can disable Unicode mode even if it would match invalid UTF-8: + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r".") + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally greek letters would be included in \w, but since + /// // Unicode mode is disabled, it only matches ASCII letters. + /// assert!(re.is_match(b"\xFF")); + /// ``` + pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.unicode(yes); + self + } + + /// This configures whether to enable case insensitive matching for the + /// entire pattern. + /// + /// This setting can also be configured using the inline flag `i` + /// in the pattern. For example, `(?i:foo)` matches `foo` case + /// insensitively while `(?-i:foo)` matches `foo` case sensitively. + /// + /// The default for this is `false`. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") + /// .case_insensitive(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"FoObarQuUx")); + /// // Even though case insensitive matching is enabled in the builder, + /// // it can be locally disabled within the pattern. In this case, + /// // `bar` is matched case sensitively. + /// assert!(!re.is_match(b"fooBARquux")); + /// ``` + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.case_insensitive(yes); + self + } + + /// This configures multi-line mode for the entire pattern. + /// + /// Enabling multi-line mode changes the behavior of the `^` and `$` + /// anchor assertions. Instead of only matching at the beginning and + /// end of a haystack, respectively, multi-line mode causes them to + /// match at the beginning and end of a line *in addition* to the + /// beginning and end of a haystack. More precisely, `^` will match at + /// the position immediately following a `\n` and `$` will match at the + /// position immediately preceding a `\n`. + /// + /// The behavior of this option can be impacted by other settings too: + /// + /// * The [`RegexBuilder::line_terminator`] option changes `\n` above + /// to any ASCII byte. + /// * The [`RegexBuilder::crlf`] option changes the line terminator to + /// be either `\r` or `\n`, but never at the position between a `\r` + /// and `\n`. + /// + /// This setting can also be configured using the inline flag `m` in + /// the pattern. + /// + /// The default for this is `false`. 
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"^foo$")
+ /// .multi_line(true)
+ /// .build()
+ /// .unwrap();
+ /// assert_eq!(Some(1..4), re.find(b"\nfoo\n").map(|m| m.range()));
+ /// ```
+ pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
+ self.builder.multi_line(yes);
+ self
+ }
+
+ /// This configures dot-matches-new-line mode for the entire pattern.
+ ///
+ /// Perhaps surprisingly, the default behavior for `.` is not to match
+ /// any character, but rather, to match any character except for the
+ /// line terminator (which is `\n` by default). When this mode is
+ /// enabled, the behavior changes such that `.` truly matches any
+ /// character.
+ ///
+ /// This setting can also be configured using the inline flag `s` in
+ /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent
+ /// regexes.
+ ///
+ /// The default for this is `false`.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::RegexBuilder;
+ ///
+ /// let re = RegexBuilder::new(r"foo.bar")
+ /// .dot_matches_new_line(true)
+ /// .build()
+ /// .unwrap();
+ /// let hay = b"foo\nbar";
+ /// assert_eq!(Some(&b"foo\nbar"[..]), re.find(hay).map(|m| m.as_bytes()));
+ /// ```
+ pub fn dot_matches_new_line(
+ &mut self,
+ yes: bool,
+ ) -> &mut RegexBuilder {
+ self.builder.dot_matches_new_line(yes);
+ self
+ }
+
+ /// This configures CRLF mode for the entire pattern.
+ ///
+ /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for
+ /// short) and `\n` ("line feed" or LF for short) are treated as line
+ /// terminators. This results in the following:
+ ///
+ /// * Unless dot-matches-new-line mode is enabled, `.` will now match
+ /// any character except for `\n` and `\r`.
+ /// * When multi-line mode is enabled, `^` will match immediately
+ /// following a `\n` or a `\r`. Similarly, `$` will match immediately
+ /// preceding a `\n` or a `\r`. 
Neither `^` nor `$` will ever match + /// between `\r` and `\n`. + /// + /// This setting can also be configured using the inline flag `R` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = b"\r\nfoo\r\n"; + /// // If CRLF mode weren't enabled here, then '$' wouldn't match + /// // immediately after 'foo', and thus no match would be found. + /// assert_eq!(Some(&b"foo"[..]), re.find(hay).map(|m| m.as_bytes())); + /// ``` + /// + /// This example demonstrates that `^` will never match at a position + /// between `\r` and `\n`. (`$` will similarly not match between a `\r` + /// and a `\n`.) + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^") + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = b"\r\n\r\n"; + /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); + /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); + /// ``` + pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.crlf(yes); + self + } + + /// Configures the line terminator to be used by the regex. + /// + /// The line terminator is relevant in two ways for a particular regex: + /// + /// * When dot-matches-new-line mode is *not* enabled (the default), + /// then `.` will match any character except for the configured line + /// terminator. + /// * When multi-line mode is enabled (not the default), then `^` and + /// `$` will match immediately after and before, respectively, a line + /// terminator. + /// + /// In both cases, if CRLF mode is enabled in a particular context, + /// then it takes precedence over any configured line terminator. + /// + /// This option cannot be configured from within the pattern. + /// + /// The default line terminator is `\n`. 
+ /// + /// # Example + /// + /// This shows how to treat the NUL byte as a line terminator. This can + /// be a useful heuristic when searching binary data. + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"^foo$") + /// .multi_line(true) + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// let hay = b"\x00foo\x00"; + /// assert_eq!(Some(1..4), re.find(hay).map(|m| m.range())); + /// ``` + /// + /// This example shows that the behavior of `.` is impacted by this + /// setting as well: + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r".") + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"\n")); + /// assert!(!re.is_match(b"\x00")); + /// ``` + /// + /// This shows that building a regex will work even when the byte + /// given is not ASCII. This is unlike the top-level `Regex` API where + /// matching invalid UTF-8 is not allowed. + /// + /// Note though that you must disable Unicode mode. This is required + /// because Unicode mode requires matching one codepoint at a time, + /// and there is no way to match a non-ASCII byte as if it were a + /// codepoint. + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// assert!( + /// RegexBuilder::new(r".") + /// .unicode(false) + /// .line_terminator(0x80) + /// .build() + /// .is_ok(), + /// ); + /// ``` + pub fn line_terminator(&mut self, byte: u8) -> &mut RegexBuilder { + self.builder.line_terminator(byte); + self + } + + /// This configures swap-greed mode for the entire pattern. + /// + /// When swap-greed mode is enabled, patterns like `a+` will become + /// non-greedy and patterns like `a+?` will become greedy. In other + /// words, the meanings of `a+` and `a+?` are switched. + /// + /// This setting can also be configured using the inline flag `U` in + /// the pattern. + /// + /// The default for this is `false`. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let re = RegexBuilder::new(r"a+") + /// .swap_greed(true) + /// .build() + /// .unwrap(); + /// assert_eq!(Some(&b"a"[..]), re.find(b"aaa").map(|m| m.as_bytes())); + /// ``` + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.swap_greed(yes); + self + } + + /// This configures verbose mode for the entire pattern. + /// + /// When enabled, whitespace will be treated as insignificant in the + /// pattern and `#` can be used to start a comment until the next new + /// line. + /// + /// Normally, in most places in a pattern, whitespace is treated + /// literally. For example ` +` will match one or more ASCII whitespace + /// characters. + /// + /// When verbose mode is enabled, `\#` can be used to match a literal + /// `#` and `\ ` can be used to match a literal ASCII whitespace + /// character. + /// + /// Verbose mode is useful for permitting regexes to be formatted and + /// broken up more nicely. This may make them more easily readable. + /// + /// This setting can also be configured using the inline flag `x` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// let pat = r" + /// \b + /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter + /// [\s--\n]+ # whitespace should separate names + /// (?: # middle name can be an initial! + /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) + /// [\s--\n]+ + /// )? + /// (?<last>\p{Uppercase}\w*) + /// \b + /// "; + /// let re = RegexBuilder::new(pat) + /// .ignore_whitespace(true) + /// .build() + /// .unwrap(); + /// + /// let caps = re.captures(b"Harry Potter").unwrap(); + /// assert_eq!(&b"Harry"[..], &caps["first"]); + /// assert_eq!(&b"Potter"[..], &caps["last"]); + /// + /// let caps = re.captures(b"Harry J.
Potter").unwrap(); + /// assert_eq!(&b"Harry"[..], &caps["first"]); + /// // Since a middle name/initial isn't required for an overall match, + /// // we can't assume that 'initial' or 'middle' will be populated! + /// assert_eq!( + /// Some(&b"J"[..]), + /// caps.name("initial").map(|m| m.as_bytes()), + /// ); + /// assert_eq!(None, caps.name("middle").map(|m| m.as_bytes())); + /// assert_eq!(&b"Potter"[..], &caps["last"]); + /// + /// let caps = re.captures(b"Harry James Potter").unwrap(); + /// assert_eq!(&b"Harry"[..], &caps["first"]); + /// // Since a middle name/initial isn't required for an overall match, + /// // we can't assume that 'initial' or 'middle' will be populated! + /// assert_eq!(None, caps.name("initial").map(|m| m.as_bytes())); + /// assert_eq!( + /// Some(&b"James"[..]), + /// caps.name("middle").map(|m| m.as_bytes()), + /// ); + /// assert_eq!(&b"Potter"[..], &caps["last"]); + /// ``` + pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.ignore_whitespace(yes); + self + } + + /// This configures octal mode for the entire pattern. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints + /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all + /// equivalent patterns, where the last example shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, + /// it does make good error messages harder. That is, in PCRE based + /// regex engines, syntax like `\1` invokes a backreference, which is + /// explicitly unsupported by this library. However, many users expect + /// backreferences to be supported. Therefore, when octal support + /// is disabled, the error message will explicitly mention that + /// backreferences aren't supported. + /// + /// The default for this is `false`.
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// // Normally this pattern would not compile, with an error message + /// // about backreferences not being supported. But with octal mode + /// // enabled, octal escape sequences work. + /// let re = RegexBuilder::new(r"\141") + /// .octal(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"a")); + /// ``` + pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { + self.builder.octal(yes); + self + } + + /// Sets the approximate size limit, in bytes, of the compiled regex. + /// + /// This roughly corresponds to the number of heap memory, in + /// bytes, occupied by a single regex. If the regex would otherwise + /// approximately exceed this limit, then compiling that regex will + /// fail. + /// + /// The main utility of a method like this is to avoid compiling + /// regexes that use an unexpected amount of resources, such as + /// time and memory. Even if the memory usage of a large regex is + /// acceptable, its search time may not be. Namely, worst case time + /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and + /// `n ~ len(haystack)`. That is, search time depends, in part, on the + /// size of the compiled regex. This means that putting a limit on the + /// size of the regex limits how much a regex can impact search time. + /// + /// For more information about regex size limits, see the section on + /// [untrusted inputs](crate#untrusted-input) in the top-level crate + /// documentation. + /// + /// The default for this is some reasonable number that permits most + /// patterns to compile successfully. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// // It may surprise you how big some seemingly small patterns can + /// // be! Since \w is Unicode aware, this generates a regex that can + /// // match approximately 140,000 distinct codepoints. 
+ /// assert!(RegexBuilder::new(r"\w").size_limit(45_000).build().is_err()); + /// ``` + pub fn size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { + self.builder.size_limit(bytes); + self + } + + /// Set the approximate capacity, in bytes, of the cache of transitions + /// used by the lazy DFA. + /// + /// While the lazy DFA isn't always used, it tends to be the most + /// commonly used regex engine in default configurations. It tends to + /// adopt the performance profile of a fully built DFA, but without the + /// downside of taking worst case exponential time to build. + /// + /// The downside is that it needs to keep a cache of transitions and + /// states that are built while running a search, and this cache + /// can fill up. When it fills up, the cache will reset itself. Any + /// previously generated states and transitions will then need to be + /// re-generated. If this happens too many times, then this library + /// will bail out of using the lazy DFA and switch to a different regex + /// engine. + /// + /// If your regex provokes this particular downside of the lazy DFA, + /// then it may be beneficial to increase its cache capacity. This will + /// potentially reduce the frequency of cache resetting (ideally to + /// `0`). While it won't fix all potential performance problems with + /// the lazy DFA, increasing the cache capacity does fix some. + /// + /// There is no easy way to determine, a priori, whether increasing + /// this cache capacity will help. In general, the larger your regex, + /// the more cache it's likely to use. But that isn't an ironclad rule. + /// For example, a regex like `[01]*1[01]{N}` would normally produce a + /// fully built DFA that is exponential in size with respect to `N`. + /// The lazy DFA will prevent exponential space blow-up, but its cache + /// is likely to fill up, even when it's large and even for smallish + /// values of `N`.
+ /// + /// If you aren't sure whether this helps or not, it is sensible to + /// set this to some arbitrarily large number in testing, such as + /// `usize::MAX`. Namely, this represents the amount of capacity that + /// *may* be used. It's probably not a good idea to use `usize::MAX` in + /// production though, since it implies there are no controls on heap + /// memory used by this library during a search. In effect, set it to + /// whatever you're willing to allocate for a single regex search. + pub fn dfa_size_limit(&mut self, bytes: usize) -> &mut RegexBuilder { + self.builder.dfa_size_limit(bytes); + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is + /// allowed to be. If the AST exceeds the given limit (e.g., with too + /// many nested groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an AST using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire AST is parsed. + /// Therefore, if callers want to put a limit on the amount of heap + /// space used, then they should impose a limit on the length, in + /// bytes, of the concrete pattern string. In particular, this is + /// viable since this parser implementation will limit itself to heap + /// space proportional to the length of the pattern string. See also + /// the [untrusted inputs](crate#untrusted-input) section in the + /// top-level crate documentation for more information about this. + /// + /// Note that a nest limit of `0` will return a nest limit error for + /// most patterns but not all. 
For example, a nest limit of `0` permits + /// `a` but not `ab`, since `ab` requires an explicit concatenation, + /// which results in a nest depth of `1`. In general, a nest limit is + /// not something that manifests in an obvious way in the concrete + /// syntax, therefore, it should not be used in a granular way. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexBuilder; + /// + /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); + /// assert!(RegexBuilder::new(r"ab").nest_limit(0).build().is_err()); + /// ``` + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { + self.builder.nest_limit(limit); + self + } + } + + /// A configurable builder for a [`RegexSet`]. + /// + /// This builder can be used to programmatically set flags such as `i` + /// (case insensitive) and `x` (for verbose mode). This builder can also be + /// used to configure things like the line terminator and a size limit on + /// the compiled regular expression. + #[derive(Clone, Debug)] + pub struct RegexSetBuilder { + builder: Builder, + } + + impl RegexSetBuilder { + /// Create a new builder with a default configuration for the given + /// patterns. + /// + /// If the patterns are invalid or exceed the configured size limits, + /// then an error will be returned when [`RegexSetBuilder::build`] is + /// called. + pub fn new<I, S>(patterns: I) -> RegexSetBuilder + where + I: IntoIterator<Item = S>, + S: AsRef<str>, + { + RegexSetBuilder { builder: Builder::new(patterns) } + } + + /// Compiles the patterns given to `RegexSetBuilder::new` with the + /// configuration set on this builder. + /// + /// If the patterns aren't valid regexes or if a configured size limit + /// was exceeded, then an error is returned. + pub fn build(&self) -> Result<RegexSet, Error> { + self.builder.build_many_bytes() + } + + /// This configures Unicode mode for the all of the patterns. 
+ /// + /// Enabling Unicode mode does a number of things: + /// + /// * Most fundamentally, it causes the fundamental atom of matching + /// to be a single codepoint. When Unicode mode is disabled, it's a + /// single byte. For example, when Unicode mode is enabled, `.` will + /// match `💩` once, where as it will match 4 times when Unicode mode + /// is disabled. (Since the UTF-8 encoding of `💩` is 4 bytes long.) + /// * Case insensitive matching uses Unicode simple case folding rules. + /// * Unicode character classes like `\p{Letter}` and `\p{Greek}` are + /// available. + /// * Perl character classes are Unicode aware. That is, `\w`, `\s` and + /// `\d`. + /// * The word boundary assertions, `\b` and `\B`, use the Unicode + /// definition of a word character. + /// + /// Note that unlike the top-level `RegexSet` for searching `&str`, + /// it is permitted to disable Unicode mode even if the resulting + /// pattern could match invalid UTF-8. For example, `(?-u:.)` is not + /// a valid pattern for a top-level `RegexSet`, but is valid for a + /// `bytes::RegexSet`. + /// + /// For more details on the Unicode support in this crate, see the + /// [Unicode section](crate#unicode) in this crate's top-level + /// documentation. + /// + /// The default for this is `true`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"\w"]) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally greek letters would be included in \w, but since + /// // Unicode mode is disabled, it only matches ASCII letters. + /// assert!(!re.is_match("δ".as_bytes())); + /// + /// let re = RegexSetBuilder::new([r"s"]) + /// .case_insensitive(true) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Normally 'ſ' is included when searching for 's' case + /// // insensitively due to Unicode's simple case folding rules. 
But + /// // when Unicode mode is disabled, only ASCII case insensitive rules + /// // are used. + /// assert!(!re.is_match("ſ".as_bytes())); + /// ``` + /// + /// Since this builder is for constructing a + /// [`bytes::RegexSet`](RegexSet), one can disable Unicode mode even if + /// it would match invalid UTF-8: + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"."]) + /// .unicode(false) + /// .build() + /// .unwrap(); + /// // Since Unicode mode is disabled, `.` is permitted to match any + /// // byte, including `\xFF`, which is invalid UTF-8. + /// assert!(re.is_match(b"\xFF")); + /// ``` + pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.unicode(yes); + self + } + + /// This configures whether to enable case insensitive matching for all + /// of the patterns. + /// + /// This setting can also be configured using the inline flag `i` + /// in the pattern. For example, `(?i:foo)` matches `foo` case + /// insensitively while `(?-i:foo)` matches `foo` case sensitively. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"foo(?-i:bar)quux"]) + /// .case_insensitive(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"FoObarQuUx")); + /// // Even though case insensitive matching is enabled in the builder, + /// // it can be locally disabled within the pattern. In this case, + /// // `bar` is matched case sensitively. + /// assert!(!re.is_match(b"fooBARquux")); + /// ``` + pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.case_insensitive(yes); + self + } + + /// This configures multi-line mode for all of the patterns. + /// + /// Enabling multi-line mode changes the behavior of the `^` and `$` + /// anchor assertions.
Instead of only matching at the beginning and + /// end of a haystack, respectively, multi-line mode causes them to + /// match at the beginning and end of a line *in addition* to the + /// beginning and end of a haystack. More precisely, `^` will match at + /// the position immediately following a `\n` and `$` will match at the + /// position immediately preceding a `\n`. + /// + /// The behavior of this option can be impacted by other settings too: + /// + /// * The [`RegexSetBuilder::line_terminator`] option changes `\n` + /// above to any ASCII byte. + /// * The [`RegexSetBuilder::crlf`] option changes the line terminator + /// to be either `\r` or `\n`, but never at the position between a `\r` + /// and `\n`. + /// + /// This setting can also be configured using the inline flag `m` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^foo$"]) + /// .multi_line(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"\nfoo\n")); + /// ``` + pub fn multi_line(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.multi_line(yes); + self + } + + /// This configures dot-matches-new-line mode for the entire pattern. + /// + /// Perhaps surprisingly, the default behavior for `.` is not to match + /// any character, but rather, to match any character except for the + /// line terminator (which is `\n` by default). When this mode is + /// enabled, the behavior changes such that `.` truly matches any + /// character. + /// + /// This setting can also be configured using the inline flag `s` in + /// the pattern. For example, `(?s:.)` and `\p{any}` are equivalent + /// regexes. + /// + /// The default for this is `false`. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"foo.bar"]) + /// .dot_matches_new_line(true) + /// .build() + /// .unwrap(); + /// let hay = b"foo\nbar"; + /// assert!(re.is_match(hay)); + /// ``` + pub fn dot_matches_new_line( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.builder.dot_matches_new_line(yes); + self + } + + /// This configures CRLF mode for all of the patterns. + /// + /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for + /// short) and `\n` ("line feed" or LF for short) are treated as line + /// terminators. This results in the following: + /// + /// * Unless dot-matches-new-line mode is enabled, `.` will now match + /// any character except for `\n` and `\r`. + /// * When multi-line mode is enabled, `^` will match immediately + /// following a `\n` or a `\r`. Similarly, `$` will match immediately + /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match + /// between `\r` and `\n`. + /// + /// This setting can also be configured using the inline flag `R` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^foo$"]) + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// let hay = b"\r\nfoo\r\n"; + /// // If CRLF mode weren't enabled here, then '$' wouldn't match + /// // immediately after 'foo', and thus no match would be found. + /// assert!(re.is_match(hay)); + /// ``` + /// + /// This example demonstrates that `^` will never match at a position + /// between `\r` and `\n`. (`$` will similarly not match between a `\r` + /// and a `\n`.)
+ /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^\n"]) + /// .multi_line(true) + /// .crlf(true) + /// .build() + /// .unwrap(); + /// assert!(!re.is_match(b"\r\n")); + /// ``` + pub fn crlf(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.crlf(yes); + self + } + + /// Configures the line terminator to be used by the regex. + /// + /// The line terminator is relevant in two ways for a particular regex: + /// + /// * When dot-matches-new-line mode is *not* enabled (the default), + /// then `.` will match any character except for the configured line + /// terminator. + /// * When multi-line mode is enabled (not the default), then `^` and + /// `$` will match immediately after and before, respectively, a line + /// terminator. + /// + /// In both cases, if CRLF mode is enabled in a particular context, + /// then it takes precedence over any configured line terminator. + /// + /// This option cannot be configured from within the pattern. + /// + /// The default line terminator is `\n`. + /// + /// # Example + /// + /// This shows how to treat the NUL byte as a line terminator. This can + /// be a useful heuristic when searching binary data. + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"^foo$"]) + /// .multi_line(true) + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// let hay = b"\x00foo\x00"; + /// assert!(re.is_match(hay)); + /// ``` + /// + /// This example shows that the behavior of `.` is impacted by this + /// setting as well: + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let re = RegexSetBuilder::new([r"."]) + /// .line_terminator(b'\x00') + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"\n")); + /// assert!(!re.is_match(b"\x00")); + /// ``` + /// + /// This shows that building a regex will work even when the byte given + /// is not ASCII. 
This is unlike the top-level `RegexSet` API where + /// matching invalid UTF-8 is not allowed. + /// + /// Note though that you must disable Unicode mode. This is required + /// because Unicode mode requires matching one codepoint at a time, + /// and there is no way to match a non-ASCII byte as if it were a + /// codepoint. + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// assert!( + /// RegexSetBuilder::new([r"."]) + /// .unicode(false) + /// .line_terminator(0x80) + /// .build() + /// .is_ok(), + /// ); + /// ``` + pub fn line_terminator(&mut self, byte: u8) -> &mut RegexSetBuilder { + self.builder.line_terminator(byte); + self + } + + /// This configures swap-greed mode for all of the patterns. + /// + /// When swap-greed mode is enabled, patterns like `a+` will become + /// non-greedy and patterns like `a+?` will become greedy. In other + /// words, the meanings of `a+` and `a+?` are switched. + /// + /// This setting can also be configured using the inline flag `U` in + /// the pattern. + /// + /// Note that this is generally not useful for a `RegexSet` since a + /// `RegexSet` can only report whether a pattern matches or not. Since + /// greediness never impacts whether a match is found or not (only the + /// offsets of the match), it follows that whether parts of a pattern + /// are greedy or not doesn't matter for a `RegexSet`. + /// + /// The default for this is `false`. + pub fn swap_greed(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.swap_greed(yes); + self + } + + /// This configures verbose mode for all of the patterns. + /// + /// When enabled, whitespace will be treated as insignificant in the + /// pattern and `#` can be used to start a comment until the next new + /// line. + /// + /// Normally, in most places in a pattern, whitespace is treated + /// literally. For example ` +` will match one or more ASCII whitespace + /// characters.
+ /// + /// When verbose mode is enabled, `\#` can be used to match a literal + /// `#` and `\ ` can be used to match a literal ASCII whitespace + /// character. + /// + /// Verbose mode is useful for permitting regexes to be formatted and + /// broken up more nicely. This may make them more easily readable. + /// + /// This setting can also be configured using the inline flag `x` in + /// the pattern. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// let pat = r" + /// \b + /// (?<first>\p{Uppercase}\w*) # always start with uppercase letter + /// [\s--\n]+ # whitespace should separate names + /// (?: # middle name can be an initial! + /// (?:(?<initial>\p{Uppercase})\.|(?<middle>\p{Uppercase}\w*)) + /// [\s--\n]+ + /// )? + /// (?<last>\p{Uppercase}\w*) + /// \b + /// "; + /// let re = RegexSetBuilder::new([pat]) + /// .ignore_whitespace(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"Harry Potter")); + /// assert!(re.is_match(b"Harry J. Potter")); + /// assert!(re.is_match(b"Harry James Potter")); + /// assert!(!re.is_match(b"harry J. Potter")); + /// ``` + pub fn ignore_whitespace( + &mut self, + yes: bool, + ) -> &mut RegexSetBuilder { + self.builder.ignore_whitespace(yes); + self + } + + /// This configures octal mode for all of the patterns. + /// + /// Octal syntax is a little-known way of uttering Unicode codepoints + /// in a pattern. For example, `a`, `\x61`, `\u0061` and `\141` are all + /// equivalent patterns, where the last example shows octal syntax. + /// + /// While supporting octal syntax isn't in and of itself a problem, + /// it does make good error messages harder. That is, in PCRE based + /// regex engines, syntax like `\1` invokes a backreference, which is + /// explicitly unsupported by this library. However, many users expect + /// backreferences to be supported.
Therefore, when octal support + /// is disabled, the error message will explicitly mention that + /// backreferences aren't supported. + /// + /// The default for this is `false`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// // Normally this pattern would not compile, with an error message + /// // about backreferences not being supported. But with octal mode + /// // enabled, octal escape sequences work. + /// let re = RegexSetBuilder::new([r"\141"]) + /// .octal(true) + /// .build() + /// .unwrap(); + /// assert!(re.is_match(b"a")); + /// ``` + pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { + self.builder.octal(yes); + self + } + + /// Sets the approximate size limit, in bytes, of the compiled regex. + /// + /// This roughly corresponds to the number of heap memory, in + /// bytes, occupied by a single regex. If the regex would otherwise + /// approximately exceed this limit, then compiling that regex will + /// fail. + /// + /// The main utility of a method like this is to avoid compiling + /// regexes that use an unexpected amount of resources, such as + /// time and memory. Even if the memory usage of a large regex is + /// acceptable, its search time may not be. Namely, worst case time + /// complexity for search is `O(m * n)`, where `m ~ len(pattern)` and + /// `n ~ len(haystack)`. That is, search time depends, in part, on the + /// size of the compiled regex. This means that putting a limit on the + /// size of the regex limits how much a regex can impact search time. + /// + /// For more information about regex size limits, see the section on + /// [untrusted inputs](crate#untrusted-input) in the top-level crate + /// documentation. + /// + /// The default for this is some reasonable number that permits most + /// patterns to compile successfully. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// // It may surprise you how big some seemingly small patterns can + /// // be! Since \w is Unicode aware, this generates a regex that can + /// // match approximately 140,000 distinct codepoints. + /// assert!( + /// RegexSetBuilder::new([r"\w"]) + /// .size_limit(45_000) + /// .build() + /// .is_err() + /// ); + /// ``` + pub fn size_limit(&mut self, bytes: usize) -> &mut RegexSetBuilder { + self.builder.size_limit(bytes); + self + } + + /// Set the approximate capacity, in bytes, of the cache of transitions + /// used by the lazy DFA. + /// + /// While the lazy DFA isn't always used, it tends to be the most + /// commonly used regex engine in default configurations. It tends to + /// adopt the performance profile of a fully built DFA, but without the + /// downside of taking worst case exponential time to build. + /// + /// The downside is that it needs to keep a cache of transitions and + /// states that are built while running a search, and this cache + /// can fill up. When it fills up, the cache will reset itself. Any + /// previously generated states and transitions will then need to be + /// re-generated. If this happens too many times, then this library + /// will bail out of using the lazy DFA and switch to a different regex + /// engine. + /// + /// If your regex provokes this particular downside of the lazy DFA, + /// then it may be beneficial to increase its cache capacity. This will + /// potentially reduce the frequency of cache resetting (ideally to + /// `0`). While it won't fix all potential performance problems with + /// the lazy DFA, increasing the cache capacity does fix some. + /// + /// There is no easy way to determine, a priori, whether increasing + /// this cache capacity will help. In general, the larger your regex, + /// the more cache it's likely to use. But that isn't an ironclad rule.
+ /// For example, a regex like `[01]*1[01]{N}` would normally produce a + /// fully built DFA that is exponential in size with respect to `N`. + /// The lazy DFA will prevent exponential space blow-up, but its cache + /// is likely to fill up, even when it's large and even for smallish + /// values of `N`. + /// + /// If you aren't sure whether this helps or not, it is sensible to + /// set this to some arbitrarily large number in testing, such as + /// `usize::MAX`. Namely, this represents the amount of capacity that + /// *may* be used. It's probably not a good idea to use `usize::MAX` in + /// production though, since it implies there are no controls on heap + /// memory used by this library during a search. In effect, set it to + /// whatever you're willing to allocate for a single regex search. + pub fn dfa_size_limit( + &mut self, + bytes: usize, + ) -> &mut RegexSetBuilder { + self.builder.dfa_size_limit(bytes); + self + } + + /// Set the nesting limit for this parser. + /// + /// The nesting limit controls how deep the abstract syntax tree is + /// allowed to be. If the AST exceeds the given limit (e.g., with too + /// many nested groups), then an error is returned by the parser. + /// + /// The purpose of this limit is to act as a heuristic to prevent stack + /// overflow for consumers that do structural induction on an AST using + /// explicit recursion. While this crate never does this (instead using + /// constant stack space and moving the call stack to the heap), other + /// crates may. + /// + /// This limit is not checked until the entire AST is parsed. + /// Therefore, if callers want to put a limit on the amount of heap + /// space used, then they should impose a limit on the length, in + /// bytes, of the concrete pattern string. In particular, this is + /// viable since this parser implementation will limit itself to heap + /// space proportional to the length of the pattern string.
See also + /// the [untrusted inputs](crate#untrusted-input) section in the + /// top-level crate documentation for more information about this. + /// + /// Note that a nest limit of `0` will return a nest limit error for + /// most patterns but not all. For example, a nest limit of `0` permits + /// `a` but not `ab`, since `ab` requires an explicit concatenation, + /// which results in a nest depth of `1`. In general, a nest limit is + /// not something that manifests in an obvious way in the concrete + /// syntax, therefore, it should not be used in a granular way. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSetBuilder; + /// + /// assert!(RegexSetBuilder::new([r"a"]).nest_limit(0).build().is_ok()); + /// assert!(RegexSetBuilder::new([r"ab"]).nest_limit(0).build().is_err()); + /// ``` + pub fn nest_limit(&mut self, limit: u32) -> &mut RegexSetBuilder { + self.builder.nest_limit(limit); + self + } + } +} diff --git a/src/bytes.rs b/src/bytes.rs new file mode 100644 index 0000000000..c81c1a43d1 --- /dev/null +++ b/src/bytes.rs @@ -0,0 +1,91 @@ +/*! +Search for regex matches in `&[u8]` haystacks. + +This module provides a nearly identical API via [`Regex`] to the one found in +the top-level of this crate. There are two important differences: + +1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>` +is used where `String` would have been used in the top-level API. +2. Unicode support can be disabled even when disabling it would result in +matching invalid UTF-8 bytes. + +# Example: match null terminated string + +This shows how to find all null-terminated strings in a slice of bytes. This +works even if a C string contains invalid UTF-8. + +```rust +use regex::bytes::Regex; + +let re = Regex::new(r"(?-u)(?<cstr>[^\x00]+)\x00").unwrap(); +let hay = b"foo\x00qu\xFFux\x00baz\x00"; + +// Extract all of the strings without the NUL terminator from each match. 
+// The unwrap is OK here since a match requires the `cstr` capture to match. +let cstrs: Vec<&[u8]> = + re.captures_iter(hay) + .map(|c| c.name("cstr").unwrap().as_bytes()) + .collect(); +assert_eq!(cstrs, vec![&b"foo"[..], &b"qu\xFFux"[..], &b"baz"[..]]); +``` + +# Example: selectively enable Unicode support + +This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded +string (e.g., to extract a title from a Matroska file): + +```rust +use regex::bytes::Regex; + +let re = Regex::new( + r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" +).unwrap(); +let hay = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; + +// Notice that despite the `.*` at the end, it will only match valid UTF-8 +// because Unicode mode was enabled with the `u` flag. Without the `u` flag, +// the `.*` would match the rest of the bytes regardless of whether they were +// valid UTF-8. +let (_, [title]) = re.captures(hay).unwrap().extract(); +assert_eq!(title, b"\xE2\x98\x83"); +// We can UTF-8 decode the title now. And the unwrap here +// is correct because the existence of a match guarantees +// that `title` is valid UTF-8. +let title = std::str::from_utf8(title).unwrap(); +assert_eq!(title, "☃"); +``` + +In general, if the Unicode flag is enabled in a capture group and that capture +is part of the overall match, then the capture is *guaranteed* to be valid +UTF-8. + +# Syntax + +The supported syntax is pretty much the same as the syntax for Unicode +regular expressions with a few changes that make sense for matching arbitrary +bytes: + +1. The `u` flag can be disabled even when disabling it might cause the regex to +match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in +"ASCII compatible" mode. +2. In ASCII compatible mode, neither Unicode scalar values nor Unicode +character classes are allowed. +3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) +revert to their typical ASCII definition.
`\w` maps to `[[:word:]]`, `\d` maps +to `[[:digit:]]` and `\s` maps to `[[:space:]]`. +4. In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to +determine whether a byte is a word byte or not. +5. Hexadecimal notation can be used to specify arbitrary bytes instead of +Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the +literal byte `\xFF`, while in Unicode mode, `\xFF` is the Unicode codepoint +`U+00FF` that matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal +notation when enabled. +6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the +`s` flag is additionally enabled, `.` matches any byte. + +# Performance + +In general, one should expect performance on `&[u8]` to be roughly similar to +performance on `&str`. +*/ +pub use crate::{builders::bytes::*, regex::bytes::*, regexset::bytes::*}; diff --git a/src/error.rs b/src/error.rs index 13e32d56d1..6026b3849d 100644 --- a/src/error.rs +++ b/src/error.rs @@ -1,3 +1,5 @@ +use alloc::string::{String, ToString}; + use regex_automata::meta; /// An error that occurred during parsing or compiling a regular expression. @@ -51,6 +53,7 @@ impl Error { } } +#[cfg(feature = "std")] impl std::error::Error for Error { // TODO: Remove this method entirely on the next breaking semver release. #[allow(deprecated)] @@ -62,8 +65,8 @@ impl std::error::Error for Error { } } -impl std::fmt::Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Display for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { Error::Syntax(ref err) => err.fmt(f), Error::CompiledTooBig(limit) => write!( @@ -79,8 +82,8 @@ impl std::fmt::Display for Error { // errors when people use `Regex::new(...).unwrap()`. It's a little weird, // but the `Syntax` variant is already storing a `String` anyway, so we might // as well format it nicely. 
-impl std::fmt::Debug for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +impl core::fmt::Debug for Error { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { Error::Syntax(ref err) => { let hr: String = core::iter::repeat('~').take(79).collect(); diff --git a/src/lib.rs b/src/lib.rs index 7c305eda44..191aa2e1a2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,146 +1,371 @@ /*! -This crate provides a library for parsing, compiling, and executing regular -expressions. Its syntax is similar to Perl-style regular expressions, but lacks -a few features like look around and backreferences. In exchange, all searches -execute in linear time with respect to the size of the regular expression and -search text. +This crate provides routines for searching strings for matches of a [regular +expression] (aka "regex"). The regex syntax supported by this crate is similar +to other regex engines, but it lacks several features that are not known how to +implement efficiently. This includes, but is not limited to, look-around and +backreferences. In exchange, all regex searches in this crate have worst case +`O(m * n)` time complexity, where `m` is proportional to the size of the regex +and `n` is proportional to the size of the string being searched. -This crate's documentation provides some simple examples, describes -[Unicode support](#unicode) and exhaustively lists the -[supported syntax](#syntax). +[regular expression]: https://en.wikipedia.org/wiki/Regular_expression -For more specific details on the API for regular expressions, please see the -documentation for the [`Regex`](struct.Regex.html) type. +If you just want API documentation, then skip to the [`Regex`] type. 
Otherwise, +here's a quick example showing one way of parsing the output of a grep-like +program: + +```rust +use regex::Regex; + +let re = Regex::new(r"(?m)^([^:]+):([0-9]+):(.+)$").unwrap(); +let hay = "\ +path/to/foo:54:Blue Harvest +path/to/bar:90:Something, Something, Something, Dark Side +path/to/baz:3:It's a Trap! +"; + +let mut results = vec![]; +for (_, [path, lineno, line]) in re.captures_iter(hay).map(|c| c.extract()) { + results.push((path, lineno.parse::<u64>()?, line)); +} +assert_eq!(results, vec![ + ("path/to/foo", 54, "Blue Harvest"), + ("path/to/bar", 90, "Something, Something, Something, Dark Side"), + ("path/to/baz", 3, "It's a Trap!"), +]); +# Ok::<(), Box<dyn std::error::Error>>(()) +``` + +# Overview + +The primary type in this crate is a [`Regex`]. Its most important methods are +as follows: + +* [`Regex::new`] compiles a regex using the default configuration. A +[`RegexBuilder`] permits setting a non-default configuration. (For example, +case insensitive matching, verbose mode and others.) +* [`Regex::is_match`] reports whether a match exists in a particular haystack. +* [`Regex::find`] reports the byte offsets of a match in a haystack, if one +exists. [`Regex::find_iter`] returns an iterator over all such matches. +* [`Regex::captures`] returns a [`Captures`], which reports both the byte +offsets of a match in a haystack and the byte offsets of each matching capture +group from the regex in the haystack. +[`Regex::captures_iter`] returns an iterator over all such matches. + +There is also a [`RegexSet`], which permits searching for multiple regex +patterns simultaneously in a single search. However, it currently only reports +which patterns match and *not* the byte offsets of a match. + +Otherwise, this top-level crate documentation is organized as follows: + +* [Usage](#usage) shows how to add the `regex` crate to your Rust project. +* [Examples](#examples) provides a limited selection of regex search examples. 
+* [Performance](#performance) provides a brief summary of how to optimize regex +searching speed. +* [Unicode](#unicode) discusses support for non-ASCII patterns. +* [Syntax](#syntax) enumerates the specific regex syntax supported by this +crate. +* [Untrusted input](#untrusted-input) discusses how this crate deals with regex +patterns or haystacks that are untrusted. +* [Crate features](#crate-features) documents the Cargo features that can be +enabled or disabled for this crate. +* [Other crates](#other-crates) links to other crates in the `regex` family. # Usage -This crate is [on crates.io](https://crates.io/crates/regex) and can be +The `regex` crate is [on crates.io](https://crates.io/crates/regex) and can be used by adding `regex` to your dependencies in your project's `Cargo.toml`. +Or more simply, just run `cargo add regex`. + +Here is a complete example that creates a new Rust project, adds a dependency +on `regex`, creates the source code for a regex search and then runs the +program. + +First, create the project in a new directory: + +```text +$ mkdir regex-example +$ cd regex-example +$ cargo init +``` + +Second, add a dependency on `regex`: + +```text +$ cargo add regex +``` + +Third, edit `src/main.rs`. 
Delete what's there and replace it with this: + +``` +use regex::Regex; + +fn main() { + let re = Regex::new(r"Hello (?<name>\w+)!").unwrap(); + let Some(caps) = re.captures("Hello Murphy!") else { + println!("no match!"); + return; + }; + println!("The name is: {}", &caps["name"]); +} +``` -```toml -[dependencies] -regex = "1" +Fourth, run it with `cargo run`: + +```text +$ cargo run + Compiling memchr v2.5.0 + Compiling regex-syntax v0.7.1 + Compiling aho-corasick v1.0.1 + Compiling regex v1.8.1 + Compiling regex-example v0.1.0 (/tmp/regex-example) + Finished dev [unoptimized + debuginfo] target(s) in 4.22s + Running `target/debug/regex-example` +The name is: Murphy ``` -# Example: find a date +The first time you run the program will show more output like above. But +subsequent runs shouldn't have to re-compile the dependencies. + +# Examples + +This section provides a few examples, in tutorial style, showing how to +search a haystack with a regex. There are more examples throughout the API +documentation. + +Before starting though, it's worth defining a few terms: -General use of regular expressions in this package involves compiling an -expression and then using it to search, split or replace text. For example, -to confirm that some text resembles a date: +* A **regex** is a Rust value whose type is `Regex`. We use `re` as a +variable name for a regex. +* A **pattern** is the string that is used to build a regex. We use `pat` as +a variable name for a pattern. +* A **haystack** is the string that is searched by a regex. We use `hay` as a +variable name for a haystack. + +Sometimes the words "regex" and "pattern" are used interchangeably. + +General use of regular expressions in this crate proceeds by compiling a +**pattern** into a **regex**, and then using that regex to search, split or +replace parts of a **haystack**.
+ +### Example: find a middle initial + +We'll start off with a very simple example: a regex that looks for a specific +name but uses a wildcard to match a middle initial. Our pattern serves as +something like a template that will match a particular name with *any* middle +initial. ```rust use regex::Regex; -let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); -assert!(re.is_match("2014-01-01")); + +// We use 'unwrap()' here because it would be a bug in our program if the +// pattern failed to compile to a regex. Panicking in the presence of a bug +// is okay. +let re = Regex::new(r"Homer (.)\. Simpson").unwrap(); +let hay = "Homer J. Simpson"; +let Some(caps) = re.captures(hay) else { return }; +assert_eq!("J", &caps[1]); ``` -Notice the use of the `^` and `$` anchors. In this crate, every expression -is executed with an implicit `.*?` at the beginning and end, which allows -it to match anywhere in the text. Anchors can be used to ensure that the -full text matches an expression. +There are a few things worth noticing here in our first example: -This example also demonstrates the utility of -[raw strings](https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals) -in Rust, which -are just like regular strings except they are prefixed with an `r` and do -not process any escape sequences. For example, `"\\d"` is the same -expression as `r"\d"`. +* The `.` is a special pattern meta character that means "match any single +character except for new lines." (More precisely, in this crate, it means +"match any UTF-8 encoding of any Unicode scalar value other than `\n`.") +* We can match an actual `.` literally by escaping it, i.e., `\.`. +* We use Rust's [raw strings] to avoid needing to deal with escape sequences in +both the regex pattern syntax and in Rust's string literal syntax. If we didn't +use raw strings here, we would have had to use `\\.` to match a literal `.` +character. That is, `r"\."` and `"\\."` are equivalent patterns. 
+* We put our wildcard `.` instruction in parentheses. These parentheses have a +special meaning that says, "make whatever part of the haystack matches within +these parentheses available as a capturing group." After finding a match, we +access this capture group with `&caps[1]`. -# Example: Avoid compiling the same regex in a loop +[raw strings]: https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals -It is an anti-pattern to compile the same regular expression in a loop -since compilation is typically expensive. (It takes anywhere from a few -microseconds to a few **milliseconds** depending on the size of the -regex.) Not only is compilation itself expensive, but this also prevents -optimizations that reuse allocations internally to the matching engines. +Otherwise, we execute a search using `re.captures(hay)` and return from our +function if no match occurred. We then reference the middle initial by asking +for the part of the haystack that matched the capture group indexed at `1`. +(The capture group at index 0 is implicit and always corresponds to the entire +match. In this case, that's `Homer J. Simpson`.) -In Rust, it can sometimes be a pain to pass regular expressions around if -they're used from inside a helper function. Instead, we recommend using the -[`lazy_static`](https://crates.io/crates/lazy_static) crate to ensure that -regular expressions are compiled exactly once. +### Example: named capture groups -For example: +Continuing from our middle initial example above, we can tweak the pattern +slightly to give a name to the group that matches the middle initial: ```rust -use lazy_static::lazy_static; use regex::Regex; -fn some_helper_function(text: &str) -> bool { - lazy_static! { - static ref RE: Regex = Regex::new("...").unwrap(); - } - RE.is_match(text) -} +// Note that (?P<middle>.) is a different way to spell the same thing. +let re = Regex::new(r"Homer (?<middle>.)\. Simpson").unwrap(); +let hay = "Homer J. 
Simpson"; +let Some(caps) = re.captures(hay) else { return }; +assert_eq!("J", &caps["middle"]); +``` + +Giving a name to a group can be useful when there are multiple groups in +a pattern. It makes the code referring to those groups a bit easier to +understand. + +### Example: validating a particular date format + +This example shows how to confirm whether a haystack, in its entirety, matches +a particular date format: + +```rust +use regex::Regex; + +let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +assert!(re.is_match("2010-03-14")); +``` + +Notice the use of the `^` and `$` anchors. In this crate, every regex search is +run with an implicit `(?s:.)*?` at the beginning of its pattern, which allows +the regex to match anywhere in a haystack. Anchors, as above, can be used to +ensure that the full haystack matches a pattern. + +This crate is also Unicode aware by default, which means that `\d` might match +more than you might expect it to. For example: + +```rust +use regex::Regex; -fn main() {} +let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap(); +assert!(re.is_match("𝟚𝟘𝟙𝟘-𝟘𝟛-𝟙𝟜")); ``` -Specifically, in this example, the regex will be compiled when it is used for -the first time. On subsequent uses, it will reuse the previous compilation. +To only match an ASCII decimal digit, all of the following are equivalent: + +* `[0-9]` +* `(?-u:\d)` +* `[[:digit:]]` +* `[\d&&\p{ascii}]` -# Example: iterating over capture groups +### Example: finding dates in a haystack -This crate provides convenient iterators for matching an expression -repeatedly against a search string to find successive non-overlapping -matches. For example, to find all dates in a string and be able to access -them by their component pieces: +In the previous example, we showed how one might validate that a haystack, +in its entirety, corresponded to a particular date format. But what if we wanted +to extract all things that look like dates in a specific format from a haystack?
+To do this, we can use an iterator API to find all matches (notice that we've +removed the anchors and switched to looking for ASCII-only digits): ```rust -# use regex::Regex; -# fn main() { -let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); -let text = "2012-03-14, 2013-01-01 and 2014-07-05"; -for cap in re.captures_iter(text) { - println!("Month: {} Day: {} Year: {}", &cap[2], &cap[3], &cap[1]); -} -// Output: -// Month: 03 Day: 14 Year: 2012 -// Month: 01 Day: 01 Year: 2013 -// Month: 07 Day: 05 Year: 2014 -# } +use regex::Regex; + +let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); +let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?"; +// 'm' is a 'Match', and 'as_str()' returns the matching part of the haystack. +let dates: Vec<&str> = re.find_iter(hay).map(|m| m.as_str()).collect(); +assert_eq!(dates, vec![ + "1865-04-14", + "1881-07-02", + "1901-09-06", + "1963-11-22", +]); +``` + +We can also iterate over [`Captures`] values instead of [`Match`] values, and +that in turn permits accessing each component of the date via capturing groups: + +```rust +use regex::Regex; + +let re = Regex::new(r"(?<y>[0-9]{4})-(?<m>[0-9]{2})-(?<d>[0-9]{2})").unwrap(); +let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?"; +// 'caps' is a 'Captures' value that provides access to each capture group. +let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| { + // The unwraps are okay because every capture group must match if the whole + // regex matches, and in this context, we know we have a match. + // + // Note that we use `caps.name("y").unwrap().as_str()` instead of + // `&caps["y"]` because the lifetime of the former is the same as the + // lifetime of `hay` above, but the lifetime of the latter is tied to the + // lifetime of `caps` due to how the `Index` trait is defined.
+ let year = caps.name("y").unwrap().as_str(); + let month = caps.name("m").unwrap().as_str(); + let day = caps.name("d").unwrap().as_str(); + (year, month, day) +}).collect(); +assert_eq!(dates, vec![ + ("1865", "04", "14"), + ("1881", "07", "02"), + ("1901", "09", "06"), + ("1963", "11", "22"), +]); +``` + +### Example: simpler capture group extraction + +One can use [`Captures::extract`] to make the code from the previous example a +bit simpler in this case: + +```rust +use regex::Regex; + +let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); +let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?"; +let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| { + let (_, [year, month, day]) = caps.extract(); + (year, month, day) +}).collect(); +assert_eq!(dates, vec![ + ("1865", "04", "14"), + ("1881", "07", "02"), + ("1901", "09", "06"), + ("1963", "11", "22"), +]); ``` -Notice that the year is in the capture group indexed at `1`. This is -because the *entire match* is stored in the capture group at index `0`. +`Captures::extract` works by ensuring that the number of matching groups match +the number of groups requested via the `[year, month, day]` syntax. If they do, +then the substrings for each corresponding capture group are automatically +returned in an appropriately sized array. Rust's syntax for pattern matching +arrays does the rest. -# Example: replacement with named capture groups +### Example: replacement with named capture groups Building on the previous example, perhaps we'd like to rearrange the date -formats. This can be done with text replacement. But to make the code -clearer, we can *name* our capture groups and use those names as variables -in our replacement text: +formats. This can be done by finding each match and replacing it with +something different. 
The [`Regex::replace_all`] routine provides a convenient +way to do this, including by supporting references to named groups in the +replacement string: ```rust -# use regex::Regex; -# fn main() { -let re = Regex::new(r"(?P<y>\d{4})-(?P<m>\d{2})-(?P<d>\d{2})").unwrap(); -let before = "2012-03-14, 2013-01-01 and 2014-07-05"; +use regex::Regex; + +let re = Regex::new(r"(?<y>\d{4})-(?<m>\d{2})-(?<d>\d{2})").unwrap(); +let before = "1973-01-05, 1975-08-25 and 1980-10-18"; let after = re.replace_all(before, "$m/$d/$y"); -assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); -# } +assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980"); ``` -The `replace` methods are actually polymorphic in the replacement, which +The replace methods are actually polymorphic in the replacement, which provides more flexibility than is seen here. (See the documentation for -`Regex::replace` for more details.) +[`Regex::replace`] for more details.) -Note that if your regex gets complicated, you can use the `x` flag to -enable insignificant whitespace mode, which also lets you write comments: +### Example: verbose mode + +When your regex gets complicated, you might consider using something other +than regex. But if you stick with regex, you can use the `x` flag to enable +insignificant whitespace mode or "verbose mode." In this mode, whitespace +is treated as insignificant and one may write comments. This may make your +patterns easier to comprehend. 
```rust -# use regex::Regex; -# fn main() { +use regex::Regex; + let re = Regex::new(r"(?x) - (?P<y>\d{4}) # the year + (?P<y>\d{4}) # the year, including all Unicode digits - - (?P<m>\d{2}) # the month + (?P<m>\d{2}) # the month, including all Unicode digits - - (?P<d>\d{2}) # the day + (?P<d>\d{2}) # the day, including all Unicode digits ").unwrap(); -let before = "2012-03-14, 2013-01-01 and 2014-07-05"; + +let before = "1973-01-05, 1975-08-25 and 1980-10-18"; let after = re.replace_all(before, "$m/$d/$y"); -assert_eq!(after, "03/14/2012, 01/01/2013 and 07/05/2014"); -# } +assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980"); ``` If you wish to match against whitespace in this mode, you can still use `\s`, @@ -148,10 +373,10 @@ If you wish to match against whitespace in this mode, you can still use `\s`, directly with `\ `, use its hex character code `\x20` or temporarily disable the `x` flag, e.g., `(?-x: )`. -# Example: match multiple regular expressions simultaneously +### Example: match multiple regular expressions simultaneously -This demonstrates how to use a `RegexSet` to match multiple (possibly -overlapping) regular expressions in a single scan of the search text: +This demonstrates how to use a [`RegexSet`] to match multiple (possibly +overlapping) regexes in a single scan of a haystack: ```rust use regex::RegexSet; @@ -166,7 +391,8 @@ let set = RegexSet::new(&[ r"foobar", ]).unwrap(); -// Iterate over and collect all of the matches. +// Iterate over and collect all of the matches. Each match corresponds to the +// ID of the matching pattern. let matches: Vec<_> = set.matches("foobar").into_iter().collect(); assert_eq!(matches, vec![0, 2, 3, 4, 6]); @@ -176,96 +402,223 @@ assert!(!matches.matched(5)); assert!(matches.matched(6)); ``` -# Pay for what you use +# Performance + +This section briefly discusses a few concerns regarding the speed and resource +usage of regexes. 
+ +### Only ask for what you need -With respect to searching text with a regular expression, there are three -questions that can be asked: +When running a search with a regex, there are generally three different types +of information one can ask for: -1. Does the text match this expression? -2. If so, where does it match? -3. Where did the capturing groups match? +1. Does a regex match in a haystack? +2. Where does a regex match in a haystack? +3. Where do each of the capturing groups match in a haystack? Generally speaking, this crate could provide a function to answer only #3, which would subsume #1 and #2 automatically. However, it can be significantly more expensive to compute the location of capturing group matches, so it's best not to do it if you don't need to. -Therefore, only use what you need. For example, don't use `find` if you -only need to test if an expression matches a string. (Use `is_match` -instead.) +Therefore, only ask for what you need. For example, don't use [`Regex::find`] +if you only need to test if a regex matches a haystack. Use [`Regex::is_match`] +instead. + +### Unicode can impact memory usage and search speed + +This crate has first class support for Unicode and it is **enabled by default**. +In many cases, the extra memory required to support it will be negligible and +it typically won't impact search speed. But it can in some cases. + +With respect to memory usage, the impact of Unicode principally manifests +through the use of Unicode character classes. Unicode character classes +tend to be quite large. For example, `\w` by default matches around 140,000 +distinct codepoints. This requires additional memory, and tends to slow down +regex compilation. While a `\w` here and there is unlikely to be noticed, +writing `\w{100}` will for example result in quite a large regex by default. 
+Indeed, `\w` is considerably larger than its ASCII-only version, so if your +requirements are satisfied by ASCII, it's probably a good idea to stick to +ASCII classes. The ASCII-only version of `\w` can be spelled in a number of +ways. All of the following are equivalent: + +* `[0-9A-Za-z_]` +* `(?-u:\w)` +* `[[:word:]]` +* `[\w&&\p{ascii}]` + +With respect to search speed, Unicode tends to be handled pretty well, even when +using large Unicode character classes. However, some of the faster internal +regex engines cannot handle a Unicode aware word boundary assertion. So if you +don't need Unicode-aware word boundary assertions, you might consider using +`(?-u:\b)` instead of `\b`, where the former uses an ASCII-only definition of +a word character. + +### Literals might accelerate searches + +This crate tends to be quite good at recognizing literals in a regex pattern +and using them to accelerate a search. If it is at all possible to include +some kind of literal in your pattern, then it might make search substantially +faster. For example, in the regex `\w+@\w+`, the engine will look for +occurrences of `@` and then try a reverse match for `\w+` to find the start +position. + +### Avoid re-compiling regexes, especially in a loop + +It is an anti-pattern to compile the same pattern in a loop since regex +compilation is typically expensive. (It takes anywhere from a few microseconds +to a few **milliseconds** depending on the size of the pattern.) Not only is +compilation itself expensive, but this also prevents optimizations that reuse +allocations internally to the regex engine. + +In Rust, it can sometimes be a pain to pass regexes around if they're used from +inside a helper function. Instead, we recommend using crates like [`once_cell`] +and [`lazy_static`] to ensure that patterns are compiled exactly once. 
+ +[`once_cell`]: https://crates.io/crates/once_cell +[`lazy_static`]: https://crates.io/crates/lazy_static + +This example shows how to use `once_cell`: -# Unicode +```rust +use { + once_cell::sync::Lazy, + regex::Regex, +}; -This implementation executes regular expressions **only** on valid UTF-8 -while exposing match locations as byte indices into the search string. (To -relax this restriction, use the [`bytes`](bytes/index.html) sub-module.) -Conceptually, the regex engine works by matching a haystack as if it were a -sequence of Unicode scalar values. +fn some_helper_function(haystack: &str) -> bool { + static RE: Lazy<Regex> = Lazy::new(|| Regex::new(r"...").unwrap()); + RE.is_match(haystack) +} -Only simple case folding is supported. Namely, when matching -case-insensitively, the characters are first mapped using the "simple" case -folding rules defined by Unicode. +fn main() { + assert!(some_helper_function("abc")); + assert!(!some_helper_function("ac")); +} +``` + +Specifically, in this example, the regex will be compiled when it is used for +the first time. On subsequent uses, it will reuse the previously built `Regex`. +Notice how one can define the `Regex` locally to a specific function. + +### Sharing a regex across threads can result in contention + +While a single `Regex` can be freely used from multiple threads simultaneously, +there is a small synchronization cost that must be paid. Generally speaking, +one shouldn't expect to observe this unless the principal task in each thread +is searching with the regex *and* most searches are on short haystacks. In this +case, internal contention on shared resources can spike and increase latency, +which in turn may slow down each individual search. + +One can work around this by cloning each `Regex` before sending it to another +thread. 
The cloned regexes will still share the same internal read-only portion +of its compiled state (it's reference counted), but each thread will get +optimized access to the mutable space that is used to run a search. In general, +there is no additional cost in memory to doing this. The only cost is the added +code complexity required to explicitly clone the regex. (If you share the same +`Regex` across multiple threads, each thread still gets its own mutable space, +but accessing that space is slower.) + +# Unicode -Regular expressions themselves are **only** interpreted as a sequence of -Unicode scalar values. This means you can use Unicode characters directly -in your expression: +This section discusses what kind of Unicode support this regex library has. +Before showing some examples, we'll summarize the relevant points: + +* This crate almost fully implements "Basic Unicode Support" (Level 1) as +specified by the [Unicode Technical Standard #18][UTS18]. The full details +of what is supported are documented in [UNICODE.md] in the root of the regex +crate repository. There is virtually no support for "Extended Unicode Support" +(Level 2) from UTS#18. +* The top-level [`Regex`] runs searches *as if* iterating over each of the +codepoints in the haystack. That is, the fundamental atom of matching is a +single codepoint. +* [`bytes::Regex`], in contrast, permits disabling Unicode mode for part of all +of your pattern in all cases. When Unicode mode is disabled, then a search is +run *as if* iterating over each byte in the haystack. That is, the fundamental +atom of matching is a single byte. (A top-level `Regex` also permits disabling +Unicode and thus matching *as if* it were one byte at a time, but only when +doing so wouldn't permit matching invalid UTF-8.) +* When Unicode mode is enabled (the default), `.` will match an entire Unicode +scalar value, even when it is encoded using multiple bytes. 
When Unicode mode +is disabled (e.g., `(?-u:.)`), then `.` will match a single byte in all cases. +* The character classes `\w`, `\d` and `\s` are all Unicode-aware by default. +Use `(?-u:\w)`, `(?-u:\d)` and `(?-u:\s)` to get their ASCII-only definitions. +* Similarly, `\b` and `\B` use a Unicode definition of a "word" character. To +get ASCII-only word boundaries, use `(?-u:\b)` and `(?-u:\B)`. +* `^` and `$` are **not** Unicode-aware in multi-line mode. Namely, they only +recognize `\n` (assuming CRLF mode is not enabled) and not any of the other +forms of line terminators defined by Unicode. +* Case insensitive searching is Unicode-aware and uses simple case folding. +* Unicode general categories, scripts and many boolean properties are available +by default via the `\p{property name}` syntax. +* In all cases, matches are reported using byte offsets. Or more precisely, +UTF-8 code unit offsets. This permits constant time indexing and slicing of the +haystack. + +[UTS18]: https://unicode.org/reports/tr18/ +[UNICODE.md]: https://github.com/rust-lang/regex/blob/master/UNICODE.md + +Patterns themselves are **only** interpreted as a sequence of Unicode scalar +values. This means you can use Unicode characters directly in your pattern: ```rust -# use regex::Regex; -# fn main() { +use regex::Regex; + let re = Regex::new(r"(?i)Δ+").unwrap(); -let mat = re.find("ΔδΔ").unwrap(); -assert_eq!((mat.start(), mat.end()), (0, 6)); -# } +let m = re.find("ΔδΔ").unwrap(); +assert_eq!((0, 6), (m.start(), m.end())); +// alternatively: +assert_eq!(0..6, m.range()); ``` -Most features of the regular expressions in this crate are Unicode aware. Here -are some examples: - -* `.` will match any valid UTF-8 encoded Unicode scalar value except for `\n`. - (To also match `\n`, enable the `s` flag, e.g., `(?s:.)`.) -* `\w`, `\d` and `\s` are Unicode aware. For example, `\s` will match all forms - of whitespace categorized by Unicode. -* `\b` matches a Unicode word boundary. 
-* Negated character classes like `[^a]` match all Unicode scalar values except - for `a`. -* `^` and `$` are **not** Unicode aware in multi-line mode. Namely, they only - recognize `\n` and not any of the other forms of line terminators defined - by Unicode. - -Unicode general categories, scripts, script extensions, ages and a smattering -of boolean properties are available as character classes. For example, you can -match a sequence of numerals, Greek or Cherokee letters: +As noted above, Unicode general categories, scripts, script extensions, ages +and a smattering of boolean properties are available as character classes. For +example, you can match a sequence of numerals, Greek or Cherokee letters: ```rust -# use regex::Regex; -# fn main() { +use regex::Regex; + let re = Regex::new(r"[\pN\p{Greek}\p{Cherokee}]+").unwrap(); -let mat = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); -assert_eq!((mat.start(), mat.end()), (3, 23)); -# } +let m = re.find("abcΔᎠβⅠᏴγδⅡxyz").unwrap(); +assert_eq!(3..23, m.range()); ``` -For a more detailed breakdown of Unicode support with respect to -[UTS#18](https://unicode.org/reports/tr18/), -please see the -[UNICODE](https://github.com/rust-lang/regex/blob/master/UNICODE.md) -document in the root of the regex repository. +While not specific to Unicode, this library also supports character class set +operations. Namely, one can nest character classes arbitrarily and perform set +operations on them. Those set operations are union (the default), intersection, +difference and symmetric difference. These set operations tend to be most +useful with Unicode character classes. 
For example, to match any codepoint
+that is both in the `Greek` script and in the `Letter` general category:
+
+```rust
+use regex::Regex;
+
+let re = Regex::new(r"[\p{Greek}&&\pL]+").unwrap();
+let subs: Vec<&str> = re.find_iter("ΔδΔ𐅌ΔδΔ").map(|m| m.as_str()).collect();
+assert_eq!(subs, vec!["ΔδΔ", "ΔδΔ"]);
-# Opt out of Unicode support
+// If we just match on Greek, then all codepoints would match!
+let re = Regex::new(r"\p{Greek}+").unwrap();
+let subs: Vec<&str> = re.find_iter("ΔδΔ𐅌ΔδΔ").map(|m| m.as_str()).collect();
+assert_eq!(subs, vec!["ΔδΔ𐅌ΔδΔ"]);
+```
+
+### Opt out of Unicode support
 
-The `bytes` sub-module provides a `Regex` type that can be used to match
-on `&[u8]`. By default, text is interpreted as UTF-8 just like it is with
-the main `Regex` type. However, this behavior can be disabled by turning
-off the `u` flag, even if doing so could result in matching invalid UTF-8.
-For example, when the `u` flag is disabled, `.` will match any byte instead
-of any Unicode scalar value.
+The [`bytes::Regex`] type can be used to search `&[u8]` haystacks. By
+default, haystacks are conventionally treated as UTF-8 just like they are with
+the main `Regex` type. However, this behavior can be disabled by turning off
+the `u` flag, even if doing so could result in matching invalid UTF-8. For
+example, when the `u` flag is disabled, `.` will match any byte instead of any
+Unicode scalar value.
 
 Disabling the `u` flag is also possible with the standard `&str`-based `Regex`
 type, but it is only allowed where the UTF-8 invariant is maintained. For
 example, `(?-u:\w)` is an ASCII-only `\w` character class and is legal in an
-`&str`-based `Regex`, but `(?-u:\xFF)` will attempt to match the raw byte
-`\xFF`, which is invalid UTF-8 and therefore is illegal in `&str`-based
+`&str`-based `Regex`, but `(?-u:\W)` will attempt to match *any byte* that
+isn't in `(?-u:\w)`, which in turn includes bytes that are invalid UTF-8.
+Similarly, `(?-u:\xFF)` will attempt to match the raw byte `\xFF` (instead of +`U+00FF`), which is invalid UTF-8 and therefore is illegal in `&str`-based regexes. Finally, since Unicode support requires bundling large Unicode data @@ -281,10 +634,11 @@ The syntax supported in this crate is documented below. Note that the regular expression parser and abstract syntax are exposed in a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). -## Matching one character +### Matching one character <pre class="rust"> . any character except new line (includes new line with s flag) +[0-9] any ASCII digit \d digit (\p{Nd}) \D not digit \pX Unicode character class identified by a one-letter name @@ -307,6 +661,7 @@ a separate crate, [`regex-syntax`](https://docs.rs/regex-syntax). [0-9--4] Direct subtraction (matching 0-9 except 4) [a-g~~b-h] Symmetric difference (matching `a` and `h` only) [\[\]] Escaping in character classes (matching [ or ]) +[a&&b] An empty character class matching nothing </pre> Any named character class may appear inside a bracketed `[...]` character @@ -315,12 +670,14 @@ digit. `[\p{Greek}&&\pL]` matches Greek letters. Precedence in character classes, from most binding to least: -1. Ranges: `a-cd` == `[a-c]d` -2. Union: `ab&&bc` == `[ab]&&[bc]` -3. Intersection: `^a-z&&b` == `^[a-z&&b]` -4. Negation +1. Ranges: `[a-cd]` == `[[a-c]d]` +2. Union: `[ab&&bc]` == `[[ab]&&[bc]]` +3. Intersection, difference, symmetric difference. All three have equivalent +precedence, and are evaluated in left-to-right order. For example, +`[\pL--\p{Greek}&&\p{Uppercase}]` == `[[\pL--\p{Greek}]&&\p{Uppercase}]`. +4. Negation: `[^a-z&&b]` == `[^[a-z&&b]]`. 
-## Composites +### Composites <pre class="rust"> xy concatenation (x followed by y) @@ -346,7 +703,7 @@ let re = Regex::new(r"sam|samwise").unwrap(); assert_eq!("sam", re.find(haystack).unwrap().as_str()); ``` -## Repetitions +### Repetitions <pre class="rust"> x* zero or more of x (greedy) @@ -363,21 +720,40 @@ x{n,}? at least n x (ungreedy/lazy) x{n}? exactly n x </pre> -## Empty matches +### Empty matches <pre class="rust"> -^ the beginning of text (or start-of-line with multi-line mode) -$ the end of text (or end-of-line with multi-line mode) -\A only the beginning of text (even with multi-line mode enabled) -\z only the end of text (even with multi-line mode enabled) +^ the beginning of a haystack (or start-of-line with multi-line mode) +$ the end of a haystack (or end-of-line with multi-line mode) +\A only the beginning of a haystack (even with multi-line mode enabled) +\z only the end of a haystack (even with multi-line mode enabled) \b a Unicode word boundary (\w on one side and \W, \A, or \z on other) \B not a Unicode word boundary </pre> -The empty regex is valid and matches the empty string. For example, the empty -regex matches `abc` at positions `0`, `1`, `2` and `3`. +The empty regex is valid and matches the empty string. For example, the +empty regex matches `abc` at positions `0`, `1`, `2` and `3`. When using the +top-level [`Regex`] on `&str` haystacks, an empty match that splits a codepoint +is guaranteed to never be returned. However, such matches are permitted when +using a [`bytes::Regex`]. For example: + +```rust +let re = regex::Regex::new(r"").unwrap(); +let ranges: Vec<_> = re.find_iter("💩").map(|m| m.range()).collect(); +assert_eq!(ranges, vec![0..0, 4..4]); + +let re = regex::bytes::Regex::new(r"").unwrap(); +let ranges: Vec<_> = re.find_iter("💩".as_bytes()).map(|m| m.range()).collect(); +assert_eq!(ranges, vec![0..0, 1..1, 2..2, 3..3, 4..4]); +``` + +Note that an empty regex is distinct from a regex that can never match. 
+For example, the regex `[a&&b]` is a character class that represents the +intersection of `a` and `b`. That intersection is empty, which means the +character class is empty. Since nothing is in the empty set, `[a&&b]` matches +nothing, not even the empty string. -## Grouping and flags +### Grouping and flags <pre class="rust"> (exp) numbered capture group (indexed by opening parenthesis) @@ -405,6 +781,7 @@ All flags are by default disabled unless stated otherwise. They are: i case-insensitive: letters match both upper and lower case m multi-line mode: ^ and $ match begin/end of line s allow . to match \n +R enables CRLF mode: when multi-line mode is enabled, \r\n is used U swap the meaning of x* and x*? u Unicode support (enabled by default) x verbose mode, ignores whitespace and allow line comments (starting with `#`) @@ -418,22 +795,22 @@ Flags can be toggled within a pattern. Here's an example that matches case-insensitively for the first part but case-sensitively for the second part: ```rust -# use regex::Regex; -# fn main() { +use regex::Regex; + let re = Regex::new(r"(?i)a+(?-i)b+").unwrap(); -let cap = re.captures("AaAaAbbBBBb").unwrap(); -assert_eq!(&cap[0], "AaAaAbb"); -# } +let m = re.find("AaAaAbbBBBb").unwrap(); +assert_eq!(m.as_str(), "AaAaAbb"); ``` Notice that the `a+` matches either `a` or `A`, but the `b+` only matches `b`. 
Multi-line mode means `^` and `$` no longer match just at the beginning/end of
-the input, but at the beginning/end of lines:
+the input, but also at the beginning/end of lines:
 
 ```
-# use regex::Regex;
+use regex::Regex;
+
 let re = Regex::new(r"(?m)^line \d+").unwrap();
 let m = re.find("line one\nline 2\n").unwrap();
 assert_eq!(m.as_str(), "line 2");
@@ -442,44 +819,68 @@ assert_eq!(m.as_str(), "line 2");
 Note that `^` matches after new lines, even at the end of input:
 
 ```
-# use regex::Regex;
+use regex::Regex;
+
 let re = Regex::new(r"(?m)^").unwrap();
 let m = re.find_iter("test\n").last().unwrap();
 assert_eq!((m.start(), m.end()), (5, 5));
 ```
 
-Here is an example that uses an ASCII word boundary instead of a Unicode
-word boundary:
+When both CRLF mode and multi-line mode are enabled, then `^` and `$` will
+match either `\r` or `\n`, but never in the middle of a `\r\n`:
+
+```
+use regex::Regex;
+
+let re = Regex::new(r"(?mR)^foo$").unwrap();
+let m = re.find("\r\nfoo\r\n").unwrap();
+assert_eq!(m.as_str(), "foo");
+```
+
+Unicode mode can also be selectively disabled, although only when the result
+*would not* match invalid UTF-8. One good example of this is using an ASCII
+word boundary instead of a Unicode word boundary, which might make some regex
+searches run faster:
 
 ```rust
-# use regex::Regex;
-# fn main() {
+use regex::Regex;
+
 let re = Regex::new(r"(?-u:\b).+(?-u:\b)").unwrap();
-let cap = re.captures("$$abc$$").unwrap();
-assert_eq!(&cap[0], "abc");
-# }
+let m = re.find("$$abc$$").unwrap();
+assert_eq!(m.as_str(), "abc");
 ```
 
-## Escape sequences
+### Escape sequences
+
+Note that this includes all possible escape sequences, even ones that are
+documented elsewhere.
 
<pre class="rust"> -\* literal *, works for any punctuation character: \.+*?()|[]{}^$ +\* literal *, applies to all ASCII except [0-9A-Za-z<>] \a bell (\x07) \f form feed (\x0C) \t horizontal tab \n new line \r carriage return \v vertical tab (\x0B) -\123 octal character code (up to three digits) (when enabled) +\A matches at the beginning of a haystack +\z matches at the end of a haystack +\b word boundary assertion +\B negated word boundary assertion +\123 octal character code, up to three digits (when enabled) \x7F hex character code (exactly two digits) \x{10FFFF} any hex character code corresponding to a Unicode code point \u007F hex character code (exactly four digits) \u{7F} any hex character code corresponding to a Unicode code point \U0000007F hex character code (exactly eight digits) \U{7F} any hex character code corresponding to a Unicode code point +\p{Letter} Unicode character class +\P{Letter} negated Unicode character class +\d, \s, \w Perl character class +\D, \S, \W negated Perl character class </pre> -## Perl character classes (Unicode friendly) +### Perl character classes (Unicode friendly) These classes are based on the definitions provided in [UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties): @@ -493,7 +894,10 @@ These classes are based on the definitions provided in \W not word character </pre> -## ASCII character classes +### ASCII character classes + +These classes are based on the definitions provided in +[UTS#18](https://www.unicode.org/reports/tr18/#Compatibility_Properties): <pre class="rust"> [[:alnum:]] alphanumeric ([0-9A-Za-z]) @@ -512,16 +916,228 @@ These classes are based on the definitions provided in [[:xdigit:]] hex digit ([0-9A-Fa-f]) </pre> +# Untrusted input + +This crate is meant to be able to run regex searches on untrusted haystacks +without fear of [ReDoS]. This crate also, to a certain extent, supports +untrusted patterns. 
+ +[ReDoS]: https://en.wikipedia.org/wiki/ReDoS + +This crate differs from most (but not all) other regex engines in that it +doesn't use unbounded backtracking to run a regex search. In those cases, +one generally cannot use untrusted patterns *or* untrusted haystacks because +it can be very difficult to know whether a particular pattern will result in +catastrophic backtracking or not. + +We'll first discuss how this crate deals with untrusted inputs and then wrap +it up with a realistic discussion about what practice really looks like. + +### Panics + +Outside of clearly documented cases, most APIs in this crate are intended to +never panic regardless of the inputs given to them. For example, `Regex::new`, +`Regex::is_match`, `Regex::find` and `Regex::captures` should never panic. That +is, it is an API promise that those APIs will never panic no matter what inputs +are given to them. With that said, regex engines are complicated beasts, and +providing a rock solid guarantee that these APIs literally never panic is +essentially equivalent to saying, "there are no bugs in this library." That is +a bold claim, and not really one that can be feasibly made with a straight +face. + +Don't get the wrong impression here. This crate is extensively tested, not just +with unit and integration tests, but also via fuzz testing. For example, this +crate is part of the [OSS-fuzz project]. Panics should be incredibly rare, but +it is possible for bugs to exist, and thus possible for a panic to occur. If +you need a rock solid guarantee against panics, then you should wrap calls into +this library with [`std::panic::catch_unwind`]. + +It's also worth pointing out that this library will *generally* panic when +other regex engines would commit undefined behavior. When undefined behavior +occurs, your program might continue as if nothing bad has happened, but it also +might mean your program is open to the worst kinds of exploits. 
In contrast, +the worst thing a panic can do is a denial of service. + +[OSS-fuzz project]: https://android.googlesource.com/platform/external/oss-fuzz/+/refs/tags/android-t-preview-1/projects/rust-regex/ +[`std::panic::catch_unwind`]: https://doc.rust-lang.org/std/panic/fn.catch_unwind.html + +### Untrusted patterns + +The principal way this crate deals with them is by limiting their size by +default. The size limit can be configured via [`RegexBuilder::size_limit`]. The +idea of a size limit is that compiling a pattern into a `Regex` will fail if it +becomes "too big." Namely, while *most* resources consumed by compiling a regex +are approximately proportional (albeit with some high constant factors in some +cases, such as with Unicode character classes) to the length of the pattern +itself, there is one particular exception to this: counted repetitions. Namely, +this pattern: + +```text +a{5}{5}{5}{5}{5}{5} +``` + +Is equivalent to this pattern: + +```text +a{15625} +``` + +In both of these cases, the actual pattern string is quite small, but the +resulting `Regex` value is quite large. Indeed, as the first pattern shows, +it isn't enough to locally limit the size of each repetition because they can +be stacked in a way that results in exponential growth. + +To provide a bit more context, a simplified view of regex compilation looks +like this: + +* The pattern string is parsed into a structured representation called an AST. +Counted repetitions are not expanded and Unicode character classes are not +looked up in this stage. That is, the size of the AST is proportional to the +size of the pattern with "reasonable" constant factors. In other words, one +can reasonably limit the memory used by an AST by limiting the length of the +pattern string. +* The AST is translated into an HIR. Counted repetitions are still *not* +expanded at this stage, but Unicode character classes are embedded into the +HIR. 
The memory usage of a HIR is still proportional to the length of the +original pattern string, but the constant factors---mostly as a result of +Unicode character classes---can be quite high. Still though, the memory used by +an HIR can be reasonably limited by limiting the length of the pattern string. +* The HIR is compiled into a [Thompson NFA]. This is the stage at which +something like `\w{5}` is rewritten to `\w\w\w\w\w`. Thus, this is the stage +at which [`RegexBuilder::size_limit`] is enforced. If the NFA exceeds the +configured size, then this stage will fail. + +[Thompson NFA]: https://en.wikipedia.org/wiki/Thompson%27s_construction + +The size limit helps avoid two different kinds of exorbitant resource usage: + +* It avoids permitting exponential memory usage based on the size of the +pattern string. +* It avoids long search times. This will be discussed in more detail in the +next section, but worst case search time *is* dependent on the size of the +regex. So keeping regexes limited to a reasonable size is also a way of keeping +search times reasonable. + +Finally, it's worth pointing out that regex compilation is guaranteed to take +worst case `O(m)` time, where `m` is proportional to the size of regex. The +size of the regex here is *after* the counted repetitions have been expanded. + +**Advice for those using untrusted regexes**: limit the pattern length to +something small and expand it as needed. Configure [`RegexBuilder::size_limit`] +to something small and then expand it as needed. + +### Untrusted haystacks + +The main way this crate guards against searches from taking a long time is by +using algorithms that guarantee a `O(m * n)` worst case time and space bound. +Namely: + +* `m` is proportional to the size of the regex, where the size of the regex +includes the expansion of all counted repetitions. (See the previous section on +untrusted patterns.) +* `n` is proportional to the length, in bytes, of the haystack. 
+ +In other words, if you consider `m` to be a constant (for example, the regex +pattern is a literal in the source code), then the search can be said to run +in "linear time." Or equivalently, "linear time with respect to the size of the +haystack." + +But the `m` factor here is important not to ignore. If a regex is +particularly big, the search times can get quite slow. This is why, in part, +[`RegexBuilder::size_limit`] exists. + +**Advice for those searching untrusted haystacks**: As long as your regexes +are not enormous, you should expect to be able to search untrusted haystacks +without fear. If you aren't sure, you should benchmark it. Unlike backtracking +engines, if your regex is so big that it's likely to result in slow searches, +this is probably something you'll be able to observe regardless of what the +haystack is made up of. + +### Iterating over matches + +One thing that is perhaps easy to miss is that the worst case time +complexity bound of `O(m * n)` applies to methods like [`Regex::is_match`], +[`Regex::find`] and [`Regex::captures`]. It does **not** apply to +[`Regex::find_iter`] or [`Regex::captures_iter`]. Namely, since iterating over +all matches can execute many searches, and each search can scan the entire +haystack, the worst case time complexity for iterators is `O(m * n^2)`. + +One example of where this occurs is when a pattern consists of an alternation, +where an earlier branch of the alternation requires scanning the entire +haystack only to discover that there is no match. It also requires a later +branch of the alternation to have matched at the beginning of the search. For +example, consider the pattern `.*[^A-Z]|[A-Z]` and the haystack `AAAAA`. The +first search will scan to the end looking for matches of `.*[^A-Z]` even though +a finite automata engine (as in this crate) knows that `[A-Z]` has already +matched the first character of the haystack. This is due to the greedy nature +of regex searching. 
That first search will report a match at the first `A` only +after scanning to the end to discover that no other match exists. The next +search then begins at the second `A` and the behavior repeats. + +There is no way to avoid this. This means that if both patterns and haystacks +are untrusted and you're iterating over all matches, you're susceptible to +worst case quadratic time complexity. One possible way to mitigate this +is to drop down to the lower level `regex-automata` crate and use its +`meta::Regex` iterator APIs. There, you can configure the search to operate +in "earliest" mode by passing a `Input::new(haystack).earliest(true)` to +`meta::Regex::find_iter` (for example). By enabling this mode, you give up +the normal greedy match semantics of regex searches and instead ask the regex +engine to immediately stop as soon as a match has been found. Enabling this +mode will thus restore the worst case `O(m * n)` time complexity bound, but at +the cost of different semantics. + +### Untrusted inputs in practice + +While providing a `O(m * n)` worst case time bound on all searches goes a long +way toward preventing [ReDoS], that doesn't mean every search you can possibly +run will complete without burning CPU time. In general, there are a few ways +for the `m * n` time bound to still bite you: + +* You are searching an exceptionally long haystack. No matter how you slice +it, a longer haystack will take more time to search. This crate may often make +very quick work of even long haystacks because of its literal optimizations, +but those aren't available for all regexes. +* Unicode character classes can cause searches to be quite slow in some cases. +This is especially true when they are combined with counted repetitions. While +the regex size limit above will protect you from the most egregious cases, +the default size limit still permits pretty big regexes that can execute more +slowly than one might expect. 
+* While routines like [`Regex::find`] and [`Regex::captures`] guarantee
+worst case `O(m * n)` search time, routines like [`Regex::find_iter`] and
+[`Regex::captures_iter`] actually have worst case `O(m * n^2)` search time.
+This is because `find_iter` runs many searches, and each search takes worst
+case `O(m * n)` time. Thus, iteration of all matches in a haystack has
+worst case `O(m * n^2)`. A good example of a pattern that exhibits this is
+`(?:A+){1000}|` or even `.*[^A-Z]|[A-Z]`.
+
+In general, untrusted haystacks are easier to stomach than untrusted patterns.
+Untrusted patterns give a lot more control to the caller to impact the
+performance of a search. In many cases, a regex search will actually execute in
+average case `O(n)` time (i.e., not dependent on the size of the regex), but
+this can't be guaranteed in general. Therefore, permitting untrusted patterns
+means that your only line of defense is to put a limit on how big `m` (and
+perhaps also `n`) can be in `O(m * n)`. `n` is limited by simply inspecting
+the length of the haystack while `m` is limited by *both* applying a limit to
+the length of the pattern *and* a limit on the compiled size of the regex via
+[`RegexBuilder::size_limit`].
+
+It bears repeating: if you're accepting untrusted patterns, it would be a good
+idea to start with conservative limits on `m` and `n`, and then carefully
+increase them as needed.
+
 # Crate features
 
 By default, this crate tries pretty hard to make regex matching both as fast
-as possible and as correct as it can be, within reason. This means that there
-is a lot of code dedicated to performance, the handling of Unicode data and the
-Unicode data itself. Overall, this leads to more dependencies, larger binaries
-and longer compile times. This trade off may not be appropriate in all cases,
-and indeed, even when all Unicode and performance features are disabled, one
-is still left with a perfectly serviceable regex engine that will work well
-in many cases.
+
+as possible and as correct as it can be. This means that there is a lot of +code dedicated to performance, the handling of Unicode data and the Unicode +data itself. Overall, this leads to more dependencies, larger binaries and +longer compile times. This trade off may not be appropriate in all cases, and +indeed, even when all Unicode and performance features are disabled, one is +still left with a perfectly serviceable regex engine that will work well in +many cases. (Note that code is not arbitrarily reducible, and for this reason, +the [`regex-lite`](https://docs.rs/regex-lite) crate exists to provide an even +more minimal experience by cutting out Unicode and performance, but still +maintaining the linear search time bound.) This crate exposes a number of features for controlling that trade off. Some of these features are strictly performance oriented, such that disabling them @@ -530,32 +1146,61 @@ Other features, such as the ones controlling the presence or absence of Unicode data, can result in a loss of functionality. For example, if one disables the `unicode-case` feature (described below), then compiling the regex `(?i)a` will fail since Unicode case insensitivity is enabled by default. Instead, -callers must use `(?i-u)a` instead to disable Unicode case folding. Stated -differently, enabling or disabling any of the features below can only add or -subtract from the total set of valid regular expressions. Enabling or disabling -a feature will never modify the match semantics of a regular expression. +callers must use `(?i-u)a` to disable Unicode case folding. Stated differently, +enabling or disabling any of the features below can only add or subtract from +the total set of valid regular expressions. Enabling or disabling a feature +will never modify the match semantics of a regular expression. -All features below are enabled by default. +Most features below are enabled by default. Features that aren't enabled by +default are noted. 
### Ecosystem features
 
 * **std** -
-  When enabled, this will cause `regex` to use the standard library. Currently,
-  disabling this feature will always result in a compilation error. It is
-  intended to add `alloc`-only support to regex in the future.
+  When enabled, this will cause `regex` to use the standard library. In terms
+  of APIs, `std` causes error types to implement the `std::error::Error`
+  trait. Enabling `std` will also result in performance optimizations,
+  including SIMD and faster synchronization primitives. Notably, **disabling
+  the `std` feature will result in the use of spin locks**. To use a regex
+  engine without `std` and without spin locks, you'll need to drop down to
+  the [`regex-automata`](https://docs.rs/regex-automata) crate.
+* **logging** -
+  When enabled, the `log` crate is used to emit messages about regex
+  compilation and search strategies. This is **disabled by default**. This is
+  typically only useful to someone working on this crate's internals, but might
+  be useful if you're doing some rabbit hole performance hacking. Or if you're
+  just interested in the kinds of decisions being made by the regex engine.
 
 ### Performance features
 
 * **perf** -
-  Enables all performance related features. This feature is enabled by default
-  and will always cover all features that improve performance, even if more
-  are added in the future.
+  Enables all performance related features except for `perf-dfa-full`. This
+  feature is enabled by default and is intended to cover all reasonable
+  features that improve performance, even if more are added in the future.
 * **perf-dfa** -
   Enables the use of a lazy DFA for matching. The lazy DFA is used to compile
   portions of a regex to a very fast DFA on an as-needed basis. This can
   result in substantial speedups, usually by an order of magnitude on large
   haystacks. The lazy DFA does not bring in any new dependencies, but it can
   make compile times longer.
+* **perf-dfa-full** - + Enables the use of a full DFA for matching. Full DFAs are problematic because + they have worst case `O(2^n)` construction time. For this reason, when this + feature is enabled, full DFAs are only used for very small regexes and a + very small space bound is used during determinization to avoid the DFA + from blowing up. This feature is not enabled by default, even as part of + `perf`, because it results in fairly sizeable increases in binary size and + compilation time. It can result in faster search times, but they tend to be + more modest and limited to non-Unicode regexes. +* **perf-onepass** - + Enables the use of a one-pass DFA for extracting the positions of capture + groups. This optimization applies to a subset of certain types of NFAs and + represents the fastest engine in this crate for dealing with capture groups. +* **perf-backtrack** - + Enables the use of a bounded backtracking algorithm for extracting the + positions of capture groups. This usually sits between the slowest engine + (the PikeVM) and the fastest engine (one-pass DFA) for extracting capture + groups. It's used whenever the regex is not one-pass and is small enough. * **perf-inline** - Enables the use of aggressive inlining inside match routines. This reduces the overhead of each match. The aggressive inlining, however, increases @@ -609,163 +1254,83 @@ All features below are enabled by default. This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and `\p{sb=ATerm}`. - -# Untrusted input - -This crate can handle both untrusted regular expressions and untrusted -search text. - -Untrusted regular expressions are handled by capping the size of a compiled -regular expression. -(See [`RegexBuilder::size_limit`](struct.RegexBuilder.html#method.size_limit).) -Without this, it would be trivial for an attacker to exhaust your system's -memory with expressions like `a{100}{100}{100}`. 
-
-Untrusted search text is allowed because the matching engine(s) in this
-crate have time complexity `O(mn)` (with `m ~ regex` and `n ~ search
-text`), which means there's no way to cause exponential blow-up like with
-some other regular expression engines. (We pay for this by disallowing
-features like arbitrary look-ahead and backreferences.)
-
-When a DFA is used, pathological cases with exponential state blow-up are
-avoided by constructing the DFA lazily or in an "online" manner. Therefore,
-at most one new state can be created for each byte of input. This satisfies
-our time complexity guarantees, but can lead to memory growth
-proportional to the size of the input. As a stopgap, the DFA is only
-allowed to store a fixed number of states. When the limit is reached, its
-states are wiped and continues on, possibly duplicating previous work. If
-the limit is reached too frequently, it gives up and hands control off to
-another matching engine with fixed memory requirements.
-(The DFA size limit can also be tweaked. See
-[`RegexBuilder::dfa_size_limit`](struct.RegexBuilder.html#method.dfa_size_limit).)
+# Other crates
+
+This crate has two required dependencies and several optional dependencies.
+This section briefly describes them with the goal of raising awareness of how
+different components of this crate may be used independently.
+
+It is somewhat unusual for a regex engine to have dependencies, as most regex
+libraries are self contained units with no dependencies other than a particular
+environment's standard library. Indeed, for other similarly optimized regex
+engines, most or all of the code in the dependencies of this crate would
+normally just be inseparable or coupled parts of the crate itself. But since
+Rust and its tooling ecosystem make the use of dependencies so easy, it made
+sense to spend some effort de-coupling parts of this crate and making them
+independently useful.
+
+We only briefly describe each crate here.
+
+* [`regex-lite`](https://docs.rs/regex-lite) is not a dependency of `regex`,
+but rather, a standalone zero-dependency simpler version of `regex` that
+prioritizes compile times and binary size. In exchange, it eschews Unicode
+support and performance. Its match semantics are as identical as possible to
+the `regex` crate, and for the things it supports, its APIs are identical to
+the APIs in this crate. In other words, for a lot of use cases, it is a drop-in
+replacement.
+* [`regex-syntax`](https://docs.rs/regex-syntax) provides a regular expression
+parser via `Ast` and `Hir` types. It also provides routines for extracting
+literals from a pattern. Folks can use this crate to do analysis, or even to
+build their own regex engine without having to worry about writing a parser.
+* [`regex-automata`](https://docs.rs/regex-automata) provides the regex engines
+themselves. One of the downsides of finite automata based regex engines is that
+they often need multiple internal engines in order to have similar or better
+performance than an unbounded backtracking engine in practice. `regex-automata`
+in particular provides public APIs for a PikeVM, a bounded backtracker, a
+one-pass DFA, a lazy DFA, a fully compiled DFA and a meta regex engine that
+combines all of them together. It also has native multi-pattern support and
+provides a way to compile and serialize full DFAs such that they can be loaded
+and searched in a no-std no-alloc environment. `regex-automata` itself doesn't
+even have a required dependency on `regex-syntax`!
+* [`memchr`](https://docs.rs/memchr) provides low level SIMD vectorized
+routines for quickly finding the location of single bytes or even substrings
+in a haystack. In other words, it provides fast `memchr` and `memmem` routines.
+These are used by this crate in literal optimizations.
+* [`aho-corasick`](https://docs.rs/aho-corasick) provides multi-substring
+search. 
It also provides SIMD vectorized routines in the case where the number +of substrings to search for is relatively small. The `regex` crate also uses +this for literal optimizations. */ +#![no_std] #![deny(missing_docs)] #![cfg_attr(feature = "pattern", feature(pattern))] #![warn(missing_debug_implementations)] -#[cfg(not(feature = "std"))] -compile_error!("`std` feature is currently required to build this crate"); - -// To check README's examples. #[cfg(doctest)] doc_comment::doctest!("../README.md"); -#[cfg(feature = "std")] -pub use crate::error::Error; -#[cfg(feature = "std")] -pub use crate::re_builder::set_unicode::*; -#[cfg(feature = "std")] -pub use crate::re_builder::unicode::*; -#[cfg(feature = "std")] -pub use crate::re_set::unicode::*; -#[cfg(feature = "std")] -pub use crate::re_unicode::{ - escape, CaptureLocations, CaptureMatches, CaptureNames, Captures, - Locations, Match, Matches, NoExpand, Regex, Replacer, ReplacerRef, Split, - SplitN, SubCaptureMatches, -}; - -/** -Match regular expressions on arbitrary bytes. +extern crate alloc; +#[cfg(any(test, feature = "std"))] +extern crate std; -This module provides a nearly identical API to the one found in the -top-level of this crate. There are two important differences: - -1. Matching is done on `&[u8]` instead of `&str`. Additionally, `Vec<u8>` -is used where `String` would have been used. -2. Unicode support can be disabled even when disabling it would result in -matching invalid UTF-8 bytes. - -# Example: match null terminated string - -This shows how to find all null-terminated strings in a slice of bytes: - -```rust -# use regex::bytes::Regex; -let re = Regex::new(r"(?-u)(?P<cstr>[^\x00]+)\x00").unwrap(); -let text = b"foo\x00bar\x00baz\x00"; - -// Extract all of the strings without the null terminator from each match. -// The unwrap is OK here since a match requires the `cstr` capture to match. 
-let cstrs: Vec<&[u8]> = - re.captures_iter(text) - .map(|c| c.name("cstr").unwrap().as_bytes()) - .collect(); -assert_eq!(vec![&b"foo"[..], &b"bar"[..], &b"baz"[..]], cstrs); -``` - -# Example: selectively enable Unicode support - -This shows how to match an arbitrary byte pattern followed by a UTF-8 encoded -string (e.g., to extract a title from a Matroska file): - -```rust -# use std::str; -# use regex::bytes::Regex; -let re = Regex::new( - r"(?-u)\x7b\xa9(?:[\x80-\xfe]|[\x40-\xff].)(?u:(.*))" -).unwrap(); -let text = b"\x12\xd0\x3b\x5f\x7b\xa9\x85\xe2\x98\x83\x80\x98\x54\x76\x68\x65"; -let caps = re.captures(text).unwrap(); - -// Notice that despite the `.*` at the end, it will only match valid UTF-8 -// because Unicode mode was enabled with the `u` flag. Without the `u` flag, -// the `.*` would match the rest of the bytes. -let mat = caps.get(1).unwrap(); -assert_eq!((7, 10), (mat.start(), mat.end())); - -// If there was a match, Unicode mode guarantees that `title` is valid UTF-8. -let title = str::from_utf8(&caps[1]).unwrap(); -assert_eq!("☃", title); -``` - -In general, if the Unicode flag is enabled in a capture group and that capture -is part of the overall match, then the capture is *guaranteed* to be valid -UTF-8. - -# Syntax - -The supported syntax is pretty much the same as the syntax for Unicode -regular expressions with a few changes that make sense for matching arbitrary -bytes: - -1. The `u` flag can be disabled even when disabling it might cause the regex to -match invalid UTF-8. When the `u` flag is disabled, the regex is said to be in -"ASCII compatible" mode. -2. In ASCII compatible mode, neither Unicode scalar values nor Unicode -character classes are allowed. -3. In ASCII compatible mode, Perl character classes (`\w`, `\d` and `\s`) -revert to their typical ASCII definition. `\w` maps to `[[:word:]]`, `\d` maps -to `[[:digit:]]` and `\s` maps to `[[:space:]]`. -4. 
In ASCII compatible mode, word boundaries use the ASCII compatible `\w` to -determine whether a byte is a word byte or not. -5. Hexadecimal notation can be used to specify arbitrary bytes instead of -Unicode codepoints. For example, in ASCII compatible mode, `\xFF` matches the -literal byte `\xFF`, while in Unicode mode, `\xFF` is a Unicode codepoint that -matches its UTF-8 encoding of `\xC3\xBF`. Similarly for octal notation when -enabled. -6. In ASCII compatible mode, `.` matches any *byte* except for `\n`. When the -`s` flag is additionally enabled, `.` matches any byte. - -# Performance +pub use crate::error::Error; -In general, one should expect performance on `&[u8]` to be roughly similar to -performance on `&str`. -*/ -#[cfg(feature = "std")] -pub mod bytes { - pub use crate::re_builder::bytes::*; - pub use crate::re_builder::set_bytes::*; - pub use crate::re_bytes::*; - pub use crate::re_set::bytes::*; -} +pub use crate::{builders::string::*, regex::string::*, regexset::string::*}; +mod builders; +pub mod bytes; mod error; mod find_byte; #[cfg(feature = "pattern")] mod pattern; -mod re_builder; -mod re_bytes; -mod re_set; -mod re_unicode; +mod regex; +mod regexset; + +/// Escapes all regular expression meta characters in `pattern`. +/// +/// The string returned may be safely used as a literal in a regular +/// expression. 
+pub fn escape(pattern: &str) -> alloc::string::String { + regex_syntax::escape(pattern) +} diff --git a/src/pattern.rs b/src/pattern.rs index 00549e5106..2db04d8b35 100644 --- a/src/pattern.rs +++ b/src/pattern.rs @@ -1,6 +1,6 @@ -use std::str::pattern::{Pattern, SearchStep, Searcher}; +use core::str::pattern::{Pattern, SearchStep, Searcher}; -use crate::re_unicode::{Matches, Regex}; +use crate::{Matches, Regex}; #[derive(Debug)] pub struct RegexSearcher<'r, 't> { diff --git a/src/re_builder.rs b/src/re_builder.rs deleted file mode 100644 index 5259ab0b0a..0000000000 --- a/src/re_builder.rs +++ /dev/null @@ -1,434 +0,0 @@ -use regex_automata::util::syntax; - -/// The set of user configurable options for compiling zero or more regexes. -/// This is shared among all top-level regex APIs. -#[derive(Clone, Debug)] -#[allow(missing_docs)] -struct RegexOptions { - pats: Vec<String>, - size_limit: usize, - dfa_size_limit: usize, - syntax: syntax::Config, -} - -impl Default for RegexOptions { - fn default() -> Self { - RegexOptions { - pats: vec![], - size_limit: 10 * (1 << 20), - dfa_size_limit: 2 * (1 << 20), - syntax: syntax::Config::default(), - } - } -} - -macro_rules! define_builder { - ($name:ident, $regex_mod:ident, $utf8:expr) => { - pub mod $name { - use std::sync::Arc; - - use regex_automata::meta; - - use crate::{error::Error, $regex_mod::Regex}; - - use super::RegexOptions; - - /// A configurable builder for a regular expression. - /// - /// A builder can be used to configure how the regex is built, for example, by - /// setting the default flags (which can be overridden in the expression - /// itself) or setting various limits. - #[derive(Debug)] - pub struct RegexBuilder(RegexOptions); - - impl RegexBuilder { - /// Create a new regular expression builder with the given pattern. - /// - /// If the pattern is invalid, then an error will be returned when - /// `build` is called. 
- pub fn new(pattern: &str) -> RegexBuilder { - let mut builder = RegexBuilder(RegexOptions::default()); - builder.0.pats.push(pattern.to_owned()); - builder - } - - /// Consume the builder and compile the regular expression. - /// - /// Note that calling `as_str` on the resulting `Regex` will produce the - /// pattern given to `new` verbatim. Notably, it will not incorporate any - /// of the flags set on this builder. - pub fn build(&self) -> Result<Regex, Error> { - let config = meta::Config::new() - .match_kind(regex_automata::MatchKind::LeftmostFirst) - .utf8_empty($utf8) - .nfa_size_limit(Some(self.0.size_limit)) - .hybrid_cache_capacity(self.0.dfa_size_limit); - meta::Builder::new() - .configure(config) - .syntax(self.0.syntax.clone().utf8($utf8)) - .build(&self.0.pats[0]) - .map(|meta| Regex { - meta, - pattern: Arc::from(self.0.pats[0].as_str()), - }) - .map_err(Error::from_meta_build_error) - } - - /// Set the value for the case insensitive (`i`) flag. - /// - /// When enabled, letters in the pattern will match both upper case and - /// lower case variants. - pub fn case_insensitive( - &mut self, - yes: bool, - ) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.case_insensitive(yes); - self - } - - /// Set the value for the multi-line matching (`m`) flag. - /// - /// When enabled, `^` matches the beginning of lines and `$` matches the - /// end of lines. - /// - /// By default, they match beginning/end of the input. - pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.multi_line(yes); - self - } - - /// Set the value for the any character (`s`) flag, where in `.` matches - /// anything when `s` is set and matches anything except for new line when - /// it is not set (the default). - /// - /// N.B. "matches anything" means "any byte" when Unicode is disabled and - /// means "any valid UTF-8 encoding of any Unicode scalar value" when - /// Unicode is enabled. 
- pub fn dot_matches_new_line( - &mut self, - yes: bool, - ) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.dot_matches_new_line(yes); - self - } - - /// Set the value for the greedy swap (`U`) flag. - /// - /// When enabled, a pattern like `a*` is lazy (tries to find shortest - /// match) and `a*?` is greedy (tries to find longest match). - /// - /// By default, `a*` is greedy and `a*?` is lazy. - pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.swap_greed(yes); - self - } - - /// Set the value for the ignore whitespace (`x`) flag. - /// - /// When enabled, whitespace such as new lines and spaces will be ignored - /// between expressions of the pattern, and `#` can be used to start a - /// comment until the next new line. - pub fn ignore_whitespace( - &mut self, - yes: bool, - ) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.ignore_whitespace(yes); - self - } - - /// Set the value for the Unicode (`u`) flag. - /// - /// Enabled by default. When disabled, character classes such as `\w` only - /// match ASCII word characters instead of all Unicode word characters. - pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.unicode(yes); - self - } - - /// Whether to support octal syntax or not. - /// - /// Octal syntax is a little-known way of uttering Unicode codepoints in - /// a regular expression. For example, `a`, `\x61`, `\u0061` and - /// `\141` are all equivalent regular expressions, where the last example - /// shows octal syntax. - /// - /// While supporting octal syntax isn't in and of itself a problem, it does - /// make good error messages harder. That is, in PCRE based regex engines, - /// syntax like `\0` invokes a backreference, which is explicitly - /// unsupported in Rust's regex engine. However, many users expect it to - /// be supported. 
Therefore, when octal support is disabled, the error - /// message will explicitly mention that backreferences aren't supported. - /// - /// Octal syntax is disabled by default. - pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { - self.0.syntax = self.0.syntax.octal(yes); - self - } - - /// Set the approximate size limit of the compiled regular expression. - /// - /// This roughly corresponds to the number of bytes occupied by a single - /// compiled program. If the program exceeds this number, then a - /// compilation error is returned. - pub fn size_limit( - &mut self, - limit: usize, - ) -> &mut RegexBuilder { - self.0.size_limit = limit; - self - } - - /// Set the approximate size of the cache used by the DFA. - /// - /// This roughly corresponds to the number of bytes that the DFA will - /// use while searching. - /// - /// Note that this is a *per thread* limit. There is no way to set a global - /// limit. In particular, if a regex is used from multiple threads - /// simultaneously, then each thread may use up to the number of bytes - /// specified here. - pub fn dfa_size_limit( - &mut self, - limit: usize, - ) -> &mut RegexBuilder { - self.0.dfa_size_limit = limit; - self - } - - /// Set the nesting limit for this parser. - /// - /// The nesting limit controls how deep the abstract syntax tree is allowed - /// to be. If the AST exceeds the given limit (e.g., with too many nested - /// groups), then an error is returned by the parser. - /// - /// The purpose of this limit is to act as a heuristic to prevent stack - /// overflow for consumers that do structural induction on an `Ast` using - /// explicit recursion. While this crate never does this (instead using - /// constant stack space and moving the call stack to the heap), other - /// crates may. - /// - /// This limit is not checked until the entire Ast is parsed. 
Therefore, - /// if callers want to put a limit on the amount of heap space used, then - /// they should impose a limit on the length, in bytes, of the concrete - /// pattern string. In particular, this is viable since this parser - /// implementation will limit itself to heap space proportional to the - /// length of the pattern string. - /// - /// Note that a nest limit of `0` will return a nest limit error for most - /// patterns but not all. For example, a nest limit of `0` permits `a` but - /// not `ab`, since `ab` requires a concatenation, which results in a nest - /// depth of `1`. In general, a nest limit is not something that manifests - /// in an obvious way in the concrete syntax, therefore, it should not be - /// used in a granular way. - pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { - self.0.syntax.nest_limit(limit); - self - } - } - } - }; -} - -define_builder!(bytes, re_bytes, false); -define_builder!(unicode, re_unicode, true); - -macro_rules! define_set_builder { - ($name:ident, $regex_mod:ident, $utf8:expr) => { - pub mod $name { - use std::sync::Arc; - - use regex_automata::meta; - - use crate::{error::Error, re_set::$regex_mod::RegexSet}; - - use super::RegexOptions; - - /// A configurable builder for a set of regular expressions. - /// - /// A builder can be used to configure how the regexes are built, for example, - /// by setting the default flags (which can be overridden in the expression - /// itself) or setting various limits. - #[derive(Debug)] - pub struct RegexSetBuilder(RegexOptions); - - impl RegexSetBuilder { - /// Create a new regular expression builder with the given pattern. - /// - /// If the pattern is invalid, then an error will be returned when - /// `build` is called. 
- pub fn new<I, S>(patterns: I) -> RegexSetBuilder - where - S: AsRef<str>, - I: IntoIterator<Item = S>, - { - let mut builder = RegexSetBuilder(RegexOptions::default()); - for pat in patterns { - builder.0.pats.push(pat.as_ref().to_owned()); - } - builder - } - - /// Consume the builder and compile the regular expressions into a set. - pub fn build(&self) -> Result<RegexSet, Error> { - let config = meta::Config::new() - .match_kind(regex_automata::MatchKind::All) - .utf8_empty($utf8) - .nfa_size_limit(Some(self.0.size_limit)) - .hybrid_cache_capacity(self.0.dfa_size_limit); - meta::Builder::new() - .configure(config) - .syntax(self.0.syntax.clone().utf8($utf8)) - .build_many(&self.0.pats) - .map(|meta| RegexSet { - meta, - patterns: Arc::from(&*self.0.pats), - }) - .map_err(Error::from_meta_build_error) - } - - /// Set the value for the case insensitive (`i`) flag. - pub fn case_insensitive( - &mut self, - yes: bool, - ) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.case_insensitive(yes); - self - } - - /// Set the value for the multi-line matching (`m`) flag. - pub fn multi_line( - &mut self, - yes: bool, - ) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.multi_line(yes); - self - } - - /// Set the value for the any character (`s`) flag, where in `.` matches - /// anything when `s` is set and matches anything except for new line when - /// it is not set (the default). - /// - /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet` - /// expressions and means "any Unicode scalar value" for `regex::RegexSet` - /// expressions. - pub fn dot_matches_new_line( - &mut self, - yes: bool, - ) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.dot_matches_new_line(yes); - self - } - - /// Set the value for the greedy swap (`U`) flag. 
- pub fn swap_greed( - &mut self, - yes: bool, - ) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.swap_greed(yes); - self - } - - /// Set the value for the ignore whitespace (`x`) flag. - pub fn ignore_whitespace( - &mut self, - yes: bool, - ) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.ignore_whitespace(yes); - self - } - - /// Set the value for the Unicode (`u`) flag. - pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.unicode(yes); - self - } - - /// Whether to support octal syntax or not. - /// - /// Octal syntax is a little-known way of uttering Unicode codepoints in - /// a regular expression. For example, `a`, `\x61`, `\u0061` and - /// `\141` are all equivalent regular expressions, where the last example - /// shows octal syntax. - /// - /// While supporting octal syntax isn't in and of itself a problem, it does - /// make good error messages harder. That is, in PCRE based regex engines, - /// syntax like `\0` invokes a backreference, which is explicitly - /// unsupported in Rust's regex engine. However, many users expect it to - /// be supported. Therefore, when octal support is disabled, the error - /// message will explicitly mention that backreferences aren't supported. - /// - /// Octal syntax is disabled by default. - pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { - self.0.syntax = self.0.syntax.octal(yes); - self - } - - /// Set the approximate size limit of the compiled regular expression. - /// - /// This roughly corresponds to the number of bytes occupied by a single - /// compiled program. If the program exceeds this number, then a - /// compilation error is returned. - pub fn size_limit( - &mut self, - limit: usize, - ) -> &mut RegexSetBuilder { - self.0.size_limit = limit; - self - } - - /// Set the approximate size of the cache used by the DFA. - /// - /// This roughly corresponds to the number of bytes that the DFA will - /// use while searching. 
- /// - /// Note that this is a *per thread* limit. There is no way to set a global - /// limit. In particular, if a regex is used from multiple threads - /// simultaneously, then each thread may use up to the number of bytes - /// specified here. - pub fn dfa_size_limit( - &mut self, - limit: usize, - ) -> &mut RegexSetBuilder { - self.0.dfa_size_limit = limit; - self - } - - /// Set the nesting limit for this parser. - /// - /// The nesting limit controls how deep the abstract syntax tree is allowed - /// to be. If the AST exceeds the given limit (e.g., with too many nested - /// groups), then an error is returned by the parser. - /// - /// The purpose of this limit is to act as a heuristic to prevent stack - /// overflow for consumers that do structural induction on an `Ast` using - /// explicit recursion. While this crate never does this (instead using - /// constant stack space and moving the call stack to the heap), other - /// crates may. - /// - /// This limit is not checked until the entire Ast is parsed. Therefore, - /// if callers want to put a limit on the amount of heap space used, then - /// they should impose a limit on the length, in bytes, of the concrete - /// pattern string. In particular, this is viable since this parser - /// implementation will limit itself to heap space proportional to the - /// length of the pattern string. - /// - /// Note that a nest limit of `0` will return a nest limit error for most - /// patterns but not all. For example, a nest limit of `0` permits `a` but - /// not `ab`, since `ab` requires a concatenation, which results in a nest - /// depth of `1`. In general, a nest limit is not something that manifests - /// in an obvious way in the concrete syntax, therefore, it should not be - /// used in a granular way. 
- pub fn nest_limit( - &mut self, - limit: u32, - ) -> &mut RegexSetBuilder { - self.0.syntax.nest_limit(limit); - self - } - } - } - }; -} - -define_set_builder!(set_bytes, bytes, false); -define_set_builder!(set_unicode, unicode, true); diff --git a/src/re_bytes.rs b/src/re_bytes.rs deleted file mode 100644 index 38a6664100..0000000000 --- a/src/re_bytes.rs +++ /dev/null @@ -1,1394 +0,0 @@ -use std::{ - borrow::Cow, - fmt, - iter::FusedIterator, - ops::{Index, Range}, - str::FromStr, - sync::Arc, -}; - -use regex_automata::{meta, util::captures, Input, PatternID}; - -use crate::{ - error::Error, find_byte::find_byte, re_builder::bytes::RegexBuilder, -}; - -/// Match represents a single match of a regex in a haystack. -/// -/// The lifetime parameter `'t` refers to the lifetime of the matched text. -#[derive(Copy, Clone, Eq, PartialEq)] -pub struct Match<'t> { - text: &'t [u8], - start: usize, - end: usize, -} - -impl<'t> Match<'t> { - /// Returns the starting byte offset of the match in the haystack. - #[inline] - pub fn start(&self) -> usize { - self.start - } - - /// Returns the ending byte offset of the match in the haystack. - #[inline] - pub fn end(&self) -> usize { - self.end - } - - /// Returns true if and only if this match has a length of zero. - #[inline] - pub fn is_empty(&self) -> bool { - self.start == self.end - } - - /// Returns the length, in bytes, of this match. - #[inline] - pub fn len(&self) -> usize { - self.end - self.start - } - - /// Returns the range over the starting and ending byte offsets of the - /// match in the haystack. - #[inline] - pub fn range(&self) -> Range<usize> { - self.start..self.end - } - - /// Returns the matched text. - #[inline] - pub fn as_bytes(&self) -> &'t [u8] { - &self.text[self.range()] - } - - /// Creates a new match from the given haystack and byte offsets. 
- #[inline] - fn new(haystack: &'t [u8], start: usize, end: usize) -> Match<'t> { - Match { text: haystack, start, end } - } -} - -impl<'t> std::fmt::Debug for Match<'t> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - let mut fmt = f.debug_struct("Match"); - fmt.field("start", &self.start).field("end", &self.end); - if let Ok(s) = std::str::from_utf8(self.as_bytes()) { - fmt.field("bytes", &s); - } else { - // FIXME: It would be nice if this could be printed as a string - // with invalid UTF-8 replaced with hex escapes. A alloc would - // probably okay if that makes it easier, but regex-automata does - // (at time of writing) have internal routines that do this. So - // maybe we should expose them. - fmt.field("bytes", &self.as_bytes()); - } - fmt.finish() - } -} - -impl<'t> From<Match<'t>> for Range<usize> { - fn from(m: Match<'t>) -> Range<usize> { - m.range() - } -} - -/// A compiled regular expression for matching arbitrary bytes. -/// -/// It can be used to search, split or replace text. All searching is done with -/// an implicit `.*?` at the beginning and end of an expression. To force an -/// expression to match the whole string (or a prefix or a suffix), you must -/// use an anchor like `^` or `$` (or `\A` and `\z`). -/// -/// Like the `Regex` type in the parent module, matches with this regex return -/// byte offsets into the search text. **Unlike** the parent `Regex` type, -/// these byte offsets may not correspond to UTF-8 sequence boundaries since -/// the regexes in this module can match arbitrary bytes. -#[derive(Clone)] -pub struct Regex { - pub(crate) meta: meta::Regex, - pub(crate) pattern: Arc<str>, -} - -impl fmt::Display for Regex { - /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl fmt::Debug for Regex { - /// Shows the original regular expression. 
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self, f) - } -} - -impl FromStr for Regex { - type Err = Error; - - /// Attempts to parse a string into a regular expression - fn from_str(s: &str) -> Result<Regex, Error> { - Regex::new(s) - } -} - -/// Core regular expression methods. -impl Regex { - /// Compiles a regular expression. Once compiled, it can be used repeatedly - /// to search, split or replace text in a string. - /// - /// If an invalid expression is given, then an error is returned. - pub fn new(re: &str) -> Result<Regex, Error> { - RegexBuilder::new(re).build() - } - - /// Returns true if and only if there is a match for the regex in the - /// string given. - /// - /// It is recommended to use this method if all you need to do is test - /// a match, since the underlying matching engine may be able to do less - /// work. - /// - /// # Example - /// - /// Test if some text contains at least one word with exactly 13 ASCII word - /// bytes: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let text = b"I categorically deny having triskaidekaphobia."; - /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); - /// # } - /// ``` - #[inline] - pub fn is_match(&self, text: &[u8]) -> bool { - self.is_match_at(text, 0) - } - - /// Returns the start and end byte range of the leftmost-first match in - /// `text`. If no match exists, then `None` is returned. - /// - /// Note that this should only be used if you want to discover the position - /// of the match. Testing the existence of a match is faster if you use - /// `is_match`. 
- /// - /// # Example - /// - /// Find the start and end location of the first word with exactly 13 - /// ASCII word bytes: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let text = b"I categorically deny having triskaidekaphobia."; - /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); - /// assert_eq!((mat.start(), mat.end()), (2, 15)); - /// # } - /// ``` - #[inline] - pub fn find<'t>(&self, text: &'t [u8]) -> Option<Match<'t>> { - self.find_at(text, 0) - } - - /// Returns an iterator for each successive non-overlapping match in - /// `text`, returning the start and end byte indices with respect to - /// `text`. - /// - /// # Example - /// - /// Find the start and end location of every word with exactly 13 ASCII - /// word bytes: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let text = b"Retroactively relinquishing remunerations is reprehensible."; - /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", mat); - /// } - /// # } - /// ``` - #[inline] - pub fn find_iter<'r, 't>(&'r self, text: &'t [u8]) -> Matches<'r, 't> { - Matches { text, it: self.meta.find_iter(text) } - } - - /// Returns the capture groups corresponding to the leftmost-first - /// match in `text`. Capture group `0` always corresponds to the entire - /// match. If no match is found, then `None` is returned. - /// - /// You should only use `captures` if you need access to the location of - /// capturing group matches. Otherwise, `find` is faster for discovering - /// the location of the overall match. - /// - /// # Examples - /// - /// Say you have some text with movie names and their release years, - /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text - /// looking like that, while also extracting the movie name and its release - /// year separately. 
- /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); - /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.get(1).unwrap().as_bytes(), &b"Citizen Kane"[..]); - /// assert_eq!(caps.get(2).unwrap().as_bytes(), &b"1941"[..]); - /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); - /// // You can also access the groups by index using the Index notation. - /// // Note that this will panic on an invalid index. - /// assert_eq!(&caps[1], b"Citizen Kane"); - /// assert_eq!(&caps[2], b"1941"); - /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); - /// # } - /// ``` - /// - /// Note that the full match is at capture group `0`. Each subsequent - /// capture group is indexed by the order of its opening `(`. - /// - /// We can make this example a bit clearer by using *named* capture groups: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") - /// .unwrap(); - /// let text = b"Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane"); - /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941"); - /// assert_eq!(caps.get(0).unwrap().as_bytes(), &b"'Citizen Kane' (1941)"[..]); - /// // You can also access the groups by name using the Index notation. - /// // Note that this will panic on an invalid group name. - /// assert_eq!(&caps["title"], b"Citizen Kane"); - /// assert_eq!(&caps["year"], b"1941"); - /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); - /// - /// # } - /// ``` - /// - /// Here we name the capture groups, which we can access with the `name` - /// method or the `Index` notation with a `&str`. 
Note that the named - /// capture groups are still accessible with `get` or the `Index` notation - /// with a `usize`. - /// - /// The `0`th capture group is always unnamed, so it must always be - /// accessed with `get(0)` or `[0]`. - #[inline] - pub fn captures<'t>(&self, text: &'t [u8]) -> Option<Captures<'t>> { - self.captures_at(text, 0) - } - - /// Returns an iterator over all the non-overlapping capture groups matched - /// in `text`. This is operationally the same as `find_iter`, except it - /// yields information about capturing group matches. - /// - /// # Example - /// - /// We can use this to find all movie titles and their release years in - /// some text, where the movie is formatted like "'Title' (xxxx)": - /// - /// ```rust - /// # use std::str; use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") - /// .unwrap(); - /// let text = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; - /// for caps in re.captures_iter(text) { - /// let title = str::from_utf8(&caps["title"]).unwrap(); - /// let year = str::from_utf8(&caps["year"]).unwrap(); - /// println!("Movie: {:?}, Released: {:?}", title, year); - /// } - /// // Output: - /// // Movie: Citizen Kane, Released: 1941 - /// // Movie: The Wizard of Oz, Released: 1939 - /// // Movie: M, Released: 1931 - /// # } - /// ``` - #[inline] - pub fn captures_iter<'r, 't>( - &'r self, - text: &'t [u8], - ) -> CaptureMatches<'r, 't> { - CaptureMatches { text, it: self.meta.captures_iter(text) } - } - - /// Returns an iterator of substrings of `text` delimited by a match of the - /// regular expression. Namely, each element of the iterator corresponds to - /// text that *isn't* matched by the regular expression. - /// - /// This method will *not* copy the text given. 
- /// - /// # Example - /// - /// To split a string delimited by arbitrary amounts of spaces or tabs: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"[ \t]+").unwrap(); - /// let fields: Vec<&[u8]> = re.split(b"a b \t c\td e").collect(); - /// assert_eq!(fields, vec![ - /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], - /// ]); - /// # } - /// ``` - #[inline] - pub fn split<'r, 't>(&'r self, text: &'t [u8]) -> Split<'r, 't> { - Split { text, it: self.meta.split(text) } - } - - /// Returns an iterator of at most `limit` substrings of `text` delimited - /// by a match of the regular expression. (A `limit` of `0` will return no - /// substrings.) Namely, each element of the iterator corresponds to text - /// that *isn't* matched by the regular expression. The remainder of the - /// string that is not split will be the last element in the iterator. - /// - /// This method will *not* copy the text given. - /// - /// # Example - /// - /// Get the first two words in some text: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"\W+").unwrap(); - /// let fields: Vec<&[u8]> = re.splitn(b"Hey! How are you?", 3).collect(); - /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]); - /// # } - /// ``` - #[inline] - pub fn splitn<'r, 't>( - &'r self, - text: &'t [u8], - limit: usize, - ) -> SplitN<'r, 't> { - SplitN { text, it: self.meta.splitn(text, limit) } - } - - /// Replaces the leftmost-first match with the replacement provided. The - /// replacement can be a regular byte string (where `$N` and `$name` are - /// expanded to match capture groups) or a function that takes the matches' - /// `Captures` and returns the replaced byte string. - /// - /// If no match is found, then a copy of the byte string is returned - /// unchanged. 
- /// - /// # Replacement string syntax - /// - /// All instances of `$name` in the replacement text is replaced with the - /// corresponding capture group `name`. - /// - /// `name` may be an integer corresponding to the index of the - /// capture group (counted by order of opening parenthesis where `0` is the - /// entire match) or it can be a name (consisting of letters, digits or - /// underscores) corresponding to a named capture group. - /// - /// If `name` isn't a valid capture group (whether the name doesn't exist - /// or isn't a valid index), then it is replaced with the empty string. - /// - /// The longest possible name is used. e.g., `$1a` looks up the capture - /// group named `1a` and not the capture group at index `1`. To exert more - /// precise control over the name, use braces, e.g., `${1}a`. - /// - /// To write a literal `$` use `$$`. - /// - /// # Examples - /// - /// Note that this function is polymorphic with respect to the replacement. - /// In typical usage, this can just be a normal byte string: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new("[^01]+").unwrap(); - /// assert_eq!(re.replace(b"1078910", &b""[..]), &b"1010"[..]); - /// # } - /// ``` - /// - /// But anything satisfying the `Replacer` trait will work. For example, a - /// closure of type `|&Captures| -> Vec<u8>` provides direct access to the - /// captures corresponding to a match. 
This allows one to access capturing - /// group matches easily: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # use regex::bytes::Captures; fn main() { - /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); - /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { - /// let mut replacement = caps[2].to_owned(); - /// replacement.push(b' '); - /// replacement.extend(&caps[1]); - /// replacement - /// }); - /// assert_eq!(result, &b"Bruce Springsteen"[..]); - /// # } - /// ``` - /// - /// But this is a bit cumbersome to use all the time. Instead, a simple - /// syntax is supported that expands `$name` into the corresponding capture - /// group. Here's the last example, but using this expansion technique - /// with named capture groups: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); - /// let result = re.replace(b"Springsteen, Bruce", &b"$first $last"[..]); - /// assert_eq!(result, &b"Bruce Springsteen"[..]); - /// # } - /// ``` - /// - /// Note that using `$2` instead of `$first` or `$1` instead of `$last` - /// would produce the same result. To write a literal `$` use `$$`. - /// - /// Sometimes the replacement string requires use of curly braces to - /// delineate a capture group replacement and surrounding literal text. - /// For example, if we wanted to join two words together with an - /// underscore: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); - /// let result = re.replace(b"deep fried", &b"${first}_$second"[..]); - /// assert_eq!(result, &b"deep_fried"[..]); - /// # } - /// ``` - /// - /// Without the curly braces, the capture group name `first_` would be - /// used, and since it doesn't exist, it would be replaced with the empty - /// string. 
- /// - /// Finally, sometimes you just want to replace a literal string with no - /// regard for capturing group expansion. This can be done by wrapping a - /// byte string with `NoExpand`: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// use regex::bytes::NoExpand; - /// - /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); - /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); - /// assert_eq!(result, &b"$2 $last"[..]); - /// # } - /// ``` - #[inline] - pub fn replace<'t, R: Replacer>( - &self, - text: &'t [u8], - rep: R, - ) -> Cow<'t, [u8]> { - self.replacen(text, 1, rep) - } - - /// Replaces all non-overlapping matches in `text` with the replacement - /// provided. This is the same as calling `replacen` with `limit` set to - /// `0`. - /// - /// See the documentation for `replace` for details on how to access - /// capturing group matches in the replacement text. - #[inline] - pub fn replace_all<'t, R: Replacer>( - &self, - text: &'t [u8], - rep: R, - ) -> Cow<'t, [u8]> { - self.replacen(text, 0, rep) - } - - /// Replaces at most `limit` non-overlapping matches in `text` with the - /// replacement provided. If `limit` is 0, then all non-overlapping matches - /// are replaced. - /// - /// See the documentation for `replace` for details on how to access - /// capturing group matches in the replacement text. 
- pub fn replacen<'t, R: Replacer>( - &self, - text: &'t [u8], - limit: usize, - mut rep: R, - ) -> Cow<'t, [u8]> { - if let Some(rep) = rep.no_expansion() { - let mut it = self.find_iter(text).enumerate().peekable(); - if it.peek().is_none() { - return Cow::Borrowed(text); - } - let mut new = Vec::with_capacity(text.len()); - let mut last_match = 0; - for (i, m) in it { - new.extend_from_slice(&text[last_match..m.start()]); - new.extend_from_slice(&rep); - last_match = m.end(); - if limit > 0 && i >= limit - 1 { - break; - } - } - new.extend_from_slice(&text[last_match..]); - return Cow::Owned(new); - } - - // The slower path, which we use if the replacement needs access to - // capture groups. - let mut it = self.captures_iter(text).enumerate().peekable(); - if it.peek().is_none() { - return Cow::Borrowed(text); - } - let mut new = Vec::with_capacity(text.len()); - let mut last_match = 0; - for (i, cap) in it { - // unwrap on 0 is OK because captures only reports matches - let m = cap.get(0).unwrap(); - new.extend_from_slice(&text[last_match..m.start()]); - rep.replace_append(&cap, &mut new); - last_match = m.end(); - if limit > 0 && i >= limit - 1 { - break; - } - } - new.extend_from_slice(&text[last_match..]); - Cow::Owned(new) - } -} - -/// Advanced or "lower level" search methods. -impl Regex { - /// Returns the end location of a match in the text given. - /// - /// This method may have the same performance characteristics as - /// `is_match`, except it provides an end location for a match. In - /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match that you would find via `Regex::find`. - /// - /// Note that it is not guaranteed that this routine finds the shortest or - /// "earliest" possible match. Instead, the main idea of this API is that - /// it returns the offset at the point at which the internal regex engine - /// has determined that a match has occurred. 
This may vary depending on - /// which internal regex engine is used, and thus, the offset itself may - /// change. - /// - /// # Example - /// - /// Typically, `a+` would match the entire first sequence of `a` in some - /// text, but `shortest_match` can give up as soon as it sees the first - /// `a`. - /// - /// ```rust - /// # use regex::bytes::Regex; - /// # fn main() { - /// let text = b"aaaaa"; - /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); - /// assert_eq!(pos, Some(1)); - /// # } - /// ``` - #[inline] - pub fn shortest_match(&self, text: &[u8]) -> Option<usize> { - self.shortest_match_at(text, 0) - } - - /// Returns the same as shortest_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[inline] - pub fn shortest_match_at( - &self, - text: &[u8], - start: usize, - ) -> Option<usize> { - let mut input = Input::new(text).earliest(true); - input.set_start(start); - self.meta.search_half(&input).map(|hm| hm.offset()) - } - - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[inline] - pub fn is_match_at(&self, text: &[u8], start: usize) -> bool { - let mut input = Input::new(text); - input.set_start(start); - self.meta.is_match(input) - } - - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. 
- #[inline] - pub fn find_at<'t>( - &self, - text: &'t [u8], - start: usize, - ) -> Option<Match<'t>> { - let mut input = Input::new(text); - input.set_start(start); - self.meta.find(input).map(|m| Match::new(text, m.start(), m.end())) - } - - /// Returns the same as [`Regex::captures`], but starts the search at the - /// given offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[inline] - pub fn captures_at<'t>( - &self, - text: &'t [u8], - start: usize, - ) -> Option<Captures<'t>> { - let mut caps = self.meta.create_captures(); - let mut input = Input::new(text); - input.set_start(start); - self.meta.captures(input, &mut caps); - if caps.is_match() { - Some(Captures { text, caps }) - } else { - None - } - } - - /// This is like `captures`, but uses - /// [`CaptureLocations`](struct.CaptureLocations.html) - /// instead of - /// [`Captures`](struct.Captures.html) in order to amortize allocations. - /// - /// To create a `CaptureLocations` value, use the - /// `Regex::capture_locations` method. - /// - /// This returns the overall match if this was successful, which is always - /// equivalence to the `0`th capture group. - #[inline] - pub fn captures_read<'t>( - &self, - locs: &mut CaptureLocations, - text: &'t [u8], - ) -> Option<Match<'t>> { - self.captures_read_at(locs, text, 0) - } - - /// Returns the same as `captures_read`, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. 
- #[inline] - pub fn captures_read_at<'t>( - &self, - locs: &mut CaptureLocations, - text: &'t [u8], - start: usize, - ) -> Option<Match<'t>> { - let mut input = Input::new(text); - input.set_start(start); - self.meta.captures(input, &mut locs.0); - locs.0.get_match().map(|m| Match::new(text, m.start(), m.end())) - } - - /// An undocumented alias for `captures_read_at`. - /// - /// The `regex-capi` crate previously used this routine, so to avoid - /// breaking that crate, we continue to provide the name as an undocumented - /// alias. - #[doc(hidden)] - #[inline] - pub fn read_captures_at<'t>( - &self, - locs: &mut CaptureLocations, - text: &'t [u8], - start: usize, - ) -> Option<Match<'t>> { - self.captures_read_at(locs, text, start) - } -} - -/// Auxiliary methods. -impl Regex { - /// Returns the original string of this regex. - #[inline] - pub fn as_str(&self) -> &str { - &self.pattern - } - - /// Returns an iterator over the capture names. - #[inline] - pub fn capture_names(&self) -> CaptureNames<'_> { - CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO)) - } - - /// Returns the number of captures. - #[inline] - pub fn captures_len(&self) -> usize { - self.meta.group_info().group_len(PatternID::ZERO) - } - - /// Returns the total number of capturing groups that appear in every - /// possible match. - /// - /// If the number of capture groups can vary depending on the match, then - /// this returns `None`. That is, a value is only returned when the number - /// of matching groups is invariant or "static." - /// - /// Note that like [`Regex::captures_len`], this **does** include the - /// implicit capturing group corresponding to the entire match. Therefore, - /// when a non-None value is returned, it is guaranteed to be at least `1`. - /// Stated differently, a return value of `Some(0)` is impossible. 
- /// - /// # Example - /// - /// This shows a few cases where a static number of capture groups is - /// available and a few cases where it is not. - /// - /// ``` - /// use regex::bytes::Regex; - /// - /// let len = |pattern| { - /// Regex::new(pattern).map(|re| re.static_captures_len()) - /// }; - /// - /// assert_eq!(Some(1), len("a")?); - /// assert_eq!(Some(2), len("(a)")?); - /// assert_eq!(Some(2), len("(a)|(b)")?); - /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); - /// assert_eq!(None, len("(a)|b")?); - /// assert_eq!(None, len("a|(b)")?); - /// assert_eq!(None, len("(b)*")?); - /// assert_eq!(Some(2), len("(b)+")?); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - #[inline] - pub fn static_captures_len(&self) -> Option<usize> { - self.meta.static_captures_len() - } - - /// Returns an empty set of capture locations that can be reused in - /// multiple calls to `captures_read` or `captures_read_at`. - #[inline] - pub fn capture_locations(&self) -> CaptureLocations { - CaptureLocations(self.meta.create_captures()) - } - - /// An alias for `capture_locations` to preserve backward compatibility. - /// - /// The `regex-capi` crate uses this method, so to avoid breaking that - /// crate, we continue to export it as an undocumented API. - #[doc(hidden)] - #[inline] - pub fn locations(&self) -> CaptureLocations { - self.capture_locations() - } -} - -/// An iterator over all non-overlapping matches for a particular string. -/// -/// The iterator yields a tuple of integers corresponding to the start and end -/// of the match. The indices are byte offsets. The iterator stops when no more -/// matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the matched byte string. 
-#[derive(Debug)] -pub struct Matches<'r, 't> { - text: &'t [u8], - it: meta::FindMatches<'r, 't>, -} - -impl<'r, 't> Iterator for Matches<'r, 't> { - type Item = Match<'t>; - - #[inline] - fn next(&mut self) -> Option<Match<'t>> { - self.it.next().map(|sp| Match::new(self.text, sp.start(), sp.end())) - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'r, 't> FusedIterator for Matches<'r, 't> {} - -/// An iterator that yields all non-overlapping capture groups matching a -/// particular regular expression. -/// -/// The iterator stops when no more matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the matched byte string. -#[derive(Debug)] -pub struct CaptureMatches<'r, 't> { - text: &'t [u8], - it: meta::CapturesMatches<'r, 't>, -} - -impl<'r, 't> Iterator for CaptureMatches<'r, 't> { - type Item = Captures<'t>; - - #[inline] - fn next(&mut self) -> Option<Captures<'t>> { - self.it.next().map(|caps| Captures { text: self.text, caps }) - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} - -/// Yields all substrings delimited by a regular expression match. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the byte string being split. -#[derive(Debug)] -pub struct Split<'r, 't> { - text: &'t [u8], - it: meta::Split<'r, 't>, -} - -impl<'r, 't> Iterator for Split<'r, 't> { - type Item = &'t [u8]; - - #[inline] - fn next(&mut self) -> Option<&'t [u8]> { - self.it.next().map(|span| &self.text[span]) - } -} - -impl<'r, 't> FusedIterator for Split<'r, 't> {} - -/// Yields at most `N` substrings delimited by a regular expression match. -/// -/// The last substring will be whatever remains after splitting. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the byte string being split. 
-#[derive(Debug)] -pub struct SplitN<'r, 't> { - text: &'t [u8], - it: meta::SplitN<'r, 't>, -} - -impl<'r, 't> Iterator for SplitN<'r, 't> { - type Item = &'t [u8]; - - #[inline] - fn next(&mut self) -> Option<&'t [u8]> { - self.it.next().map(|span| &self.text[span]) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.it.size_hint() - } -} - -impl<'r, 't> FusedIterator for SplitN<'r, 't> {} - -/// An iterator over the names of all possible captures. -/// -/// `None` indicates an unnamed capture; the first element (capture 0, the -/// whole matched region) is always unnamed. -/// -/// `'r` is the lifetime of the compiled regular expression. -#[derive(Clone, Debug)] -pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>); - -impl<'r> Iterator for CaptureNames<'r> { - type Item = Option<&'r str>; - - #[inline] - fn next(&mut self) -> Option<Option<&'r str>> { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.0.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.0.count() - } -} - -impl<'r> ExactSizeIterator for CaptureNames<'r> {} - -impl<'r> FusedIterator for CaptureNames<'r> {} - -/// CaptureLocations is a low level representation of the raw offsets of each -/// submatch. -/// -/// You can think of this as a lower level -/// [`Captures`](struct.Captures.html), where this type does not support -/// named capturing groups directly and it does not borrow the text that these -/// offsets were matched on. -/// -/// Primarily, this type is useful when using the lower level `Regex` APIs -/// such as `read_captures`, which permits amortizing the allocation in which -/// capture match locations are stored. -/// -/// In order to build a value of this type, you'll need to call the -/// `capture_locations` method on the `Regex` being used to execute the search. -/// The value returned can then be reused in subsequent searches. 
-/// -/// # Example -/// -/// This example shows how to create and use `CaptureLocations` in a search. -/// -/// ``` -/// use regex::bytes::Regex; -/// -/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); -/// let mut locs = re.capture_locations(); -/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); -/// assert_eq!(0..17, m.range()); -/// assert_eq!(Some((0, 17)), locs.get(0)); -/// assert_eq!(Some((0, 5)), locs.get(1)); -/// assert_eq!(Some((6, 17)), locs.get(2)); -/// -/// // Asking for an invalid capture group always returns None. -/// assert_eq!(None, locs.get(3)); -/// assert_eq!(None, locs.get(34973498648)); -/// assert_eq!(None, locs.get(9944060567225171988)); -/// ``` -#[derive(Clone, Debug)] -pub struct CaptureLocations(captures::Captures); - -/// A type alias for `CaptureLocations` for backwards compatibility. -/// -/// Previously, we exported `CaptureLocations` as `Locations` in an -/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), -/// we continue re-exporting the same undocumented API. -#[doc(hidden)] -pub type Locations = CaptureLocations; - -impl CaptureLocations { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original string matched. - #[inline] - pub fn get(&self, i: usize) -> Option<(usize, usize)> { - self.0.get_group(i).map(|sp| (sp.start, sp.end)) - } - - /// Returns the total number of capture groups (even if they didn't match). - /// - /// This is always at least `1` since every regex has at least `1` - /// capturing group that corresponds to the entire match. - #[inline] - pub fn len(&self) -> usize { - self.0.group_len() - } - - /// An alias for the `get` method for backwards compatibility. - /// - /// Previously, we exported `get` as `pos` in an undocumented API. 
To - /// prevent breaking that code (e.g., in `regex-capi`), we continue - /// re-exporting the same undocumented API. - #[doc(hidden)] - #[inline] - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - self.get(i) - } -} - -/// Captures represents a group of captured byte strings for a single match. -/// -/// The 0th capture always corresponds to the entire match. Each subsequent -/// index corresponds to the next capture group in the regex. If a capture -/// group is named, then the matched byte string is *also* available via the -/// `name` method. (Note that the 0th capture is always unnamed and so must be -/// accessed with the `get` method.) -/// -/// Positions returned from a capture group are always byte indices. -/// -/// `'t` is the lifetime of the matched text. -pub struct Captures<'t> { - text: &'t [u8], - caps: captures::Captures, -} - -impl<'t> Captures<'t> { - /// Returns the match associated with the capture group at index `i`. If - /// `i` does not correspond to a capture group, or if the capture group - /// did not participate in the match, then `None` is returned. - /// - /// # Examples - /// - /// Get the text of the match with a default of an empty string if this - /// group didn't participate in the match: - /// - /// ```rust - /// # use regex::bytes::Regex; - /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); - /// let caps = re.captures(b"abc123").unwrap(); - /// - /// let text1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); - /// let text2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); - /// assert_eq!(text1, &b"123"[..]); - /// assert_eq!(text2, &b""[..]); - /// ``` - #[inline] - pub fn get(&self, i: usize) -> Option<Match<'t>> { - self.caps - .get_group(i) - .map(|sp| Match::new(self.text, sp.start, sp.end)) - } - - /// Returns the match for the capture group named `name`. If `name` isn't a - /// valid capture group or didn't match anything, then `None` is returned. 
- #[inline] - pub fn name(&self, name: &str) -> Option<Match<'t>> { - self.caps - .get_group_by_name(name) - .map(|sp| Match::new(self.text, sp.start, sp.end)) - } - - /// An iterator that yields all capturing matches in the order in which - /// they appear in the regex. If a particular capture group didn't - /// participate in the match, then `None` is yielded for that capture. - /// - /// The first match always corresponds to the overall match of the regex. - #[inline] - pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { - SubCaptureMatches { text: self.text, it: self.caps.iter() } - } - - /// Expands all instances of `$name` in `replacement` to the corresponding - /// capture group `name`, and writes them to the `dst` buffer given. - /// - /// `name` may be an integer corresponding to the index of the capture - /// group (counted by order of opening parenthesis where `0` is the - /// entire match) or it can be a name (consisting of letters, digits or - /// underscores) corresponding to a named capture group. - /// - /// If `name` isn't a valid capture group (whether the name doesn't exist - /// or isn't a valid index), then it is replaced with the empty string. - /// - /// The longest possible name consisting of the characters `[_0-9A-Za-z]` - /// is used. e.g., `$1a` looks up the capture group named `1a` and not the - /// capture group at index `1`. To exert more precise control over the - /// name, or to refer to a capture group name that uses characters outside - /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When - /// using braces, any sequence of valid UTF-8 bytes is permitted. If the - /// sequence does not refer to a capture group name in the corresponding - /// regex, then it is replaced with an empty string. - /// - /// To write a literal `$` use `$$`. 
- #[inline] - pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { - self.caps.interpolate_bytes_into(self.text, replacement, dst); - } - - /// Returns the total number of capture groups (even if they didn't match). - /// - /// This is always at least `1`, since every regex has at least one capture - /// group that corresponds to the full match. - #[inline] - pub fn len(&self) -> usize { - self.caps.group_len() - } -} - -impl<'t> fmt::Debug for Captures<'t> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("Captures").field(&self.caps).finish() - } -} - -/* -struct CapturesDebug<'c, 't>(&'c Captures<'t>); - -impl<'c, 't> fmt::Debug for CapturesDebug<'c, 't> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fn escape_bytes(bytes: &[u8]) -> String { - let mut s = String::new(); - for &b in bytes { - s.push_str(&escape_byte(b)); - } - s - } - - fn escape_byte(byte: u8) -> String { - use std::ascii::escape_default; - - let escaped: Vec<u8> = escape_default(byte).collect(); - String::from_utf8_lossy(&escaped).into_owned() - } - - // We'd like to show something nice here, even if it means an - // allocation to build a reverse index. - let slot_to_name: HashMap<&usize, &String> = - self.0.named_groups.iter().map(|(a, b)| (b, a)).collect(); - let mut map = f.debug_map(); - for (slot, m) in self.0.locs.iter().enumerate() { - let m = m.map(|(s, e)| escape_bytes(&self.0.text[s..e])); - if let Some(name) = slot_to_name.get(&slot) { - map.entry(&name, &m); - } else { - map.entry(&slot, &m); - } - } - map.finish() - } -} -*/ - -/// Get a group by index. -/// -/// `'t` is the lifetime of the matched text. -/// -/// The text can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `get()` instead. -/// -/// # Panics -/// -/// If there is no group at the given index. 
-impl<'t> Index<usize> for Captures<'t> { - type Output = [u8]; - - fn index(&self, i: usize) -> &[u8] { - self.get(i) - .map(|m| m.as_bytes()) - .unwrap_or_else(|| panic!("no group at index '{}'", i)) - } -} - -/// Get a group by name. -/// -/// `'t` is the lifetime of the matched text and `'i` is the lifetime -/// of the group name (the index). -/// -/// The text can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `name` instead. -/// -/// # Panics -/// -/// If there is no group named by the given value. -impl<'t, 'i> Index<&'i str> for Captures<'t> { - type Output = [u8]; - - fn index<'a>(&'a self, name: &'i str) -> &'a [u8] { - self.name(name) - .map(|m| m.as_bytes()) - .unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator that yields all capturing matches in the order in which they -/// appear in the regex. -/// -/// If a particular capture group didn't participate in the match, then `None` -/// is yielded for that capture. The first match always corresponds to the -/// overall match of the regex. -/// -/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and -/// the lifetime `'t` corresponds to the originally matched text. 
-#[derive(Clone, Debug)] -pub struct SubCaptureMatches<'c, 't> { - text: &'t [u8], - it: captures::CapturesPatternIter<'c>, -} - -impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { - type Item = Option<Match<'t>>; - - #[inline] - fn next(&mut self) -> Option<Option<Match<'t>>> { - self.it.next().map(|group| { - group.map(|sp| Match::new(self.text, sp.start, sp.end)) - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.it.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} - -/// Replacer describes types that can be used to replace matches in a byte -/// string. -/// -/// In general, users of this crate shouldn't need to implement this trait, -/// since implementations are already provided for `&[u8]` along with other -/// variants of bytes types and `FnMut(&Captures) -> Vec<u8>` (or any -/// `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`), which covers most use cases. -pub trait Replacer { - /// Appends text to `dst` to replace the current match. - /// - /// The current match is represented by `caps`, which is guaranteed to - /// have a match at capture group `0`. - /// - /// For example, a no-op replacement would be - /// `dst.extend(&caps[0])`. - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>); - - /// Return a fixed unchanging replacement byte string. - /// - /// When doing replacements, if access to `Captures` is not needed (e.g., - /// the replacement byte string does not need `$` expansion), then it can - /// be beneficial to avoid finding sub-captures. - /// - /// In general, this is called once for every call to `replacen`. - fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { - None - } - - /// Return a `Replacer` that borrows and wraps this `Replacer`. 
- /// - /// This is useful when you want to take a generic `Replacer` (which might - /// not be cloneable) and use it without consuming it, so it can be used - /// more than once. - /// - /// # Example - /// - /// ``` - /// use regex::bytes::{Regex, Replacer}; - /// - /// fn replace_all_twice<R: Replacer>( - /// re: Regex, - /// src: &[u8], - /// mut rep: R, - /// ) -> Vec<u8> { - /// let dst = re.replace_all(src, rep.by_ref()); - /// let dst = re.replace_all(&dst, rep.by_ref()); - /// dst.into_owned() - /// } - /// ``` - fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { - ReplacerRef(self) - } -} - -/// By-reference adaptor for a `Replacer` -/// -/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). -#[derive(Debug)] -pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); - -impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - self.0.replace_append(caps, dst) - } - fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { - self.0.no_expansion() - } -} - -impl<'a> Replacer for &'a [u8] { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - caps.expand(*self, dst); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { - no_expansion(self) - } -} - -impl<'a> Replacer for &'a Vec<u8> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - caps.expand(*self, dst); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { - no_expansion(self) - } -} - -impl Replacer for Vec<u8> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - caps.expand(self, dst); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { - no_expansion(self) - } -} - -impl<'a> Replacer for Cow<'a, [u8]> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - caps.expand(self.as_ref(), dst); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { - no_expansion(self) - } -} - 
-impl<'a> Replacer for &'a Cow<'a, [u8]> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - caps.expand(self.as_ref(), dst); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { - no_expansion(self) - } -} - -fn no_expansion<T: AsRef<[u8]>>(t: &T) -> Option<Cow<'_, [u8]>> { - let s = t.as_ref(); - match find_byte(b'$', s) { - Some(_) => None, - None => Some(Cow::Borrowed(s)), - } -} - -impl<F, T> Replacer for F -where - F: FnMut(&Captures<'_>) -> T, - T: AsRef<[u8]>, -{ - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { - dst.extend_from_slice((*self)(caps).as_ref()); - } -} - -/// `NoExpand` indicates literal byte string replacement. -/// -/// It can be used with `replace` and `replace_all` to do a literal byte string -/// replacement without expanding `$name` to their corresponding capture -/// groups. This can be both convenient (to avoid escaping `$`, for example) -/// and performant (since capture groups don't need to be found). -/// -/// `'t` is the lifetime of the literal text. -#[derive(Clone, Debug)] -pub struct NoExpand<'t>(pub &'t [u8]); - -impl<'t> Replacer for NoExpand<'t> { - fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { - dst.extend_from_slice(self.0); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { - Some(Cow::Borrowed(self.0)) - } -} diff --git a/src/re_set.rs b/src/re_set.rs deleted file mode 100644 index 837be4d832..0000000000 --- a/src/re_set.rs +++ /dev/null @@ -1,519 +0,0 @@ -macro_rules! define_set { - ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, - $(#[$doc_regexset_example:meta])* ) => { - pub mod $name { - use std::{fmt, iter, sync::Arc}; - - use regex_automata::{ - meta, - Input, PatternID, PatternSet, PatternSetIter, - }; - - use crate::{ - error::Error, - re_builder::$builder_mod::RegexSetBuilder, - }; - -/// Match multiple (possibly overlapping) regular expressions in a single scan. 
-/// -/// A regex set corresponds to the union of two or more regular expressions. -/// That is, a regex set will match text where at least one of its -/// constituent regular expressions matches. A regex set as its formulated here -/// provides a touch more power: it will also report *which* regular -/// expressions in the set match. Indeed, this is the key difference between -/// regex sets and a single `Regex` with many alternates, since only one -/// alternate can match at a time. -/// -/// For example, consider regular expressions to match email addresses and -/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a -/// regex set is constructed from those regexes, then searching the text -/// `foo@example.com` will report both regexes as matching. Of course, one -/// could accomplish this by compiling each regex on its own and doing two -/// searches over the text. The key advantage of using a regex set is that it -/// will report the matching regexes using a *single pass through the text*. -/// If one has hundreds or thousands of regexes to match repeatedly (like a URL -/// router for a complex web application or a user agent matcher), then a regex -/// set can realize huge performance gains. -/// -/// # Example -/// -/// This shows how the above two regexes (for matching email addresses and -/// domains) might work: -/// -$(#[$doc_regexset_example])* -/// -/// Note that it would be possible to adapt the above example to using `Regex` -/// with an expression like: -/// -/// ```text -/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) -/// ``` -/// -/// After a match, one could then inspect the capture groups to figure out -/// which alternates matched. The problem is that it is hard to make this -/// approach scale when there are many regexes since the overlap between each -/// alternate isn't always obvious to reason about. 
-/// -/// # Limitations -/// -/// Regex sets are limited to answering the following two questions: -/// -/// 1. Does any regex in the set match? -/// 2. If so, which regexes in the set match? -/// -/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) -/// instead of (2) since the matching engines can stop after the first match -/// is found. -/// -/// You cannot directly extract [`Match`][crate::Match] or -/// [`Captures`][crate::Captures] objects from a regex set. If you need these -/// operations, the recommended approach is to compile each pattern in the set -/// independently and scan the exact same input a second time with those -/// independently compiled patterns: -/// -/// ```rust -/// use regex::{Regex, RegexSet}; -/// -/// let patterns = ["foo", "bar"]; -/// // Both patterns will match different ranges of this string. -/// let text = "barfoo"; -/// -/// // Compile a set matching any of our patterns. -/// let set = RegexSet::new(&patterns).unwrap(); -/// // Compile each pattern independently. -/// let regexes: Vec<_> = set.patterns().iter() -/// .map(|pat| Regex::new(pat).unwrap()) -/// .collect(); -/// -/// // Match against the whole set first and identify the individual -/// // matching patterns. -/// let matches: Vec<&str> = set.matches(text).into_iter() -/// // Dereference the match index to get the corresponding -/// // compiled pattern. -/// .map(|match_idx| ®exes[match_idx]) -/// // To get match locations or any other info, we then have to search -/// // the exact same text again, using our separately-compiled pattern. -/// .map(|pat| pat.find(text).unwrap().as_str()) -/// .collect(); -/// -/// // Matches arrive in the order the constituent patterns were declared, -/// // not the order they appear in the input. -/// assert_eq!(vec!["foo", "bar"], matches); -/// ``` -/// -/// # Performance -/// -/// A `RegexSet` has the same performance characteristics as `Regex`. 
Namely, -/// search takes `O(mn)` time, where `m` is proportional to the size of the -/// regex set and `n` is proportional to the length of the search text. -#[derive(Clone)] -pub struct RegexSet { - pub(crate) meta: meta::Regex, - pub(crate) patterns: Arc<[String]>, -} - -impl RegexSet { - /// Create a new regex set with the given regular expressions. - /// - /// This takes an iterator of `S`, where `S` is something that can produce - /// a `&str`. If any of the strings in the iterator are not valid regular - /// expressions, then an error is returned. - /// - /// # Example - /// - /// Create a new regex set from an iterator of strings: - /// - /// ```rust - /// # use regex::RegexSet; - /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); - /// assert!(set.is_match("foo")); - /// ``` - pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> - where S: AsRef<str>, I: IntoIterator<Item=S> { - RegexSetBuilder::new(exprs).build() - } - - /// Create a new empty regex set. - /// - /// # Example - /// - /// ```rust - /// # use regex::RegexSet; - /// let set = RegexSet::empty(); - /// assert!(set.is_empty()); - /// ``` - pub fn empty() -> RegexSet { - let empty: [&str; 0] = []; - RegexSetBuilder::new(empty).build().unwrap() - } - - /// Returns true if and only if one of the regexes in this set matches - /// the text given. - /// - /// This method should be preferred if you only need to test whether any - /// of the regexes in the set should match, but don't care about *which* - /// regexes matched. This is because the underlying matching engine will - /// quit immediately after seeing the first match instead of continuing to - /// find all matches. - /// - /// Note that as with searches using `Regex`, the expression is unanchored - /// by default. That is, if the regex does not start with `^` or `\A`, or - /// end with `$` or `\z`, then it is permitted to match anywhere in the - /// text. 
- /// - /// # Example - /// - /// Tests whether a set matches some text: - /// - /// ```rust - /// # use regex::RegexSet; - /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); - /// assert!(set.is_match("foo")); - /// assert!(!set.is_match("☃")); - /// ``` - #[inline] - pub fn is_match(&self, text: $text_ty) -> bool { - self.is_match_at(text, 0) - } - - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[doc(hidden)] - #[inline] - pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { - let mut input = Input::new(text); - input.set_start(start); - self.meta.is_match(input) - } - - /// Returns the set of regular expressions that match in the given text. - /// - /// The set returned contains the index of each regular expression that - /// matches in the given text. The index is in correspondence with the - /// order of regular expressions given to `RegexSet`'s constructor. - /// - /// The set can also be used to iterate over the matched indices. - /// - /// Note that as with searches using `Regex`, the expression is unanchored - /// by default. That is, if the regex does not start with `^` or `\A`, or - /// end with `$` or `\z`, then it is permitted to match anywhere in the - /// text. 
- /// - /// # Example - /// - /// Tests which regular expressions match the given text: - /// - /// ```rust - /// # use regex::RegexSet; - /// let set = RegexSet::new(&[ - /// r"\w+", - /// r"\d+", - /// r"\pL+", - /// r"foo", - /// r"bar", - /// r"barfoo", - /// r"foobar", - /// ]).unwrap(); - /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); - /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); - /// - /// // You can also test whether a particular regex matched: - /// let matches = set.matches("foobar"); - /// assert!(!matches.matched(5)); - /// assert!(matches.matched(6)); - /// ``` - pub fn matches(&self, text: $text_ty) -> SetMatches { - let mut patset = PatternSet::new(self.meta.pattern_len()); - let input = Input::new(text); - self.meta.which_overlapping_matches(&input, &mut patset); - SetMatches(patset) - } - - /// Returns the same as matches, but starts the search at the given - /// offset and stores the matches into the slice given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - /// - /// `matches` must have a length that is at least the number of regexes - /// in this set. - /// - /// This method returns true if and only if at least one member of - /// `matches` is true after executing the set against `text`. - #[doc(hidden)] - pub fn read_matches_at( - &self, - matches: &mut [bool], - text: $text_ty, - start: usize, - ) -> bool { - // This is pretty dumb. We should try to fix this, but the - // regex-automata API doesn't provide a way to store matches in an - // arbitrary &mut [bool]. Thankfully, this API is is doc(hidden) and - // thus not public... But regex-capi currently uses it. We should - // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet - // is in regex-automata, not regex. 
So maybe we should just accept a - // 'SetMatches', which is basically just a newtype around PatternSet. - let mut patset = PatternSet::new(self.meta.pattern_len()); - let mut input = Input::new(text); - input.set_start(start); - self.meta.which_overlapping_matches(&input, &mut patset); - for pid in patset.iter() { - matches[pid] = true; - } - !patset.is_empty() - } - - /// Returns the total number of regular expressions in this set. - pub fn len(&self) -> usize { - self.meta.pattern_len() - } - - /// Returns `true` if this set contains no regular expressions. - pub fn is_empty(&self) -> bool { - self.meta.pattern_len() == 0 - } - - /// Returns the patterns that this set will match on. - /// - /// This function can be used to determine the pattern for a match. The - /// slice returned has exactly as many patterns givens to this regex set, - /// and the order of the slice is the same as the order of the patterns - /// provided to the set. - /// - /// # Example - /// - /// ```rust - /// # use regex::RegexSet; - /// let set = RegexSet::new(&[ - /// r"\w+", - /// r"\d+", - /// r"\pL+", - /// r"foo", - /// r"bar", - /// r"barfoo", - /// r"foobar", - /// ]).unwrap(); - /// let matches: Vec<_> = set - /// .matches("foobar") - /// .into_iter() - /// .map(|match_idx| &set.patterns()[match_idx]) - /// .collect(); - /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); - /// ``` - pub fn patterns(&self) -> &[String] { - &self.patterns - } -} - -impl Default for RegexSet { - fn default() -> Self { - RegexSet::empty() - } -} - -/// A set of matches returned by a regex set. -#[derive(Clone, Debug)] -pub struct SetMatches(PatternSet); - -impl SetMatches { - /// Whether this set contains any matches. - pub fn matched_any(&self) -> bool { - !self.0.is_empty() - } - - /// Whether the regex at the given index matched. - /// - /// The index for a regex is determined by its insertion order upon the - /// initial construction of a `RegexSet`, starting at `0`. 
- /// - /// # Panics - /// - /// If `regex_index` is greater than or equal to `self.len()`. - pub fn matched(&self, regex_index: usize) -> bool { - self.0.contains(PatternID::new_unchecked(regex_index)) - } - - /// The total number of regexes in the set that created these matches. - /// - /// **WARNING:** This always returns the same value as [`RegexSet::len`]. - /// In particular, it does *not* return the number of elements yielded by - /// [`SetMatches::iter`]. The only way to determine the total number of - /// matched regexes is to iterate over them. - pub fn len(&self) -> usize { - self.0.capacity() - } - - /// Returns an iterator over indexes in the regex that matched. - /// - /// This will always produces matches in ascending order of index, where - /// the index corresponds to the index of the regex that matched with - /// respect to its position when initially building the set. - pub fn iter(&self) -> SetMatchesIter<'_> { - SetMatchesIter(self.0.iter()) - } -} - -impl IntoIterator for SetMatches { - type IntoIter = SetMatchesIntoIter; - type Item = usize; - - fn into_iter(self) -> Self::IntoIter { - let it = 0..self.0.capacity(); - SetMatchesIntoIter { - patset: self.0, it - } - } -} - -impl<'a> IntoIterator for &'a SetMatches { - type IntoIter = SetMatchesIter<'a>; - type Item = usize; - - fn into_iter(self) -> Self::IntoIter { - self.iter() - } -} - -/// An owned iterator over the set of matches from a regex set. -/// -/// This will always produces matches in ascending order of index, where the -/// index corresponds to the index of the regex that matched with respect to -/// its position when initially building the set. 
-#[derive(Debug)] -pub struct SetMatchesIntoIter { - patset: PatternSet, - it: std::ops::Range<usize>, -} - -impl Iterator for SetMatchesIntoIter { - type Item = usize; - - fn next(&mut self) -> Option<usize> { - loop { - let id = self.it.next()?; - if self.patset.contains(PatternID::new_unchecked(id)) { - return Some(id); - } - } - } - - fn size_hint(&self) -> (usize, Option<usize>) { - self.it.size_hint() - } -} - -impl DoubleEndedIterator for SetMatchesIntoIter { - fn next_back(&mut self) -> Option<usize> { - loop { - let id = self.it.next_back()?; - if self.patset.contains(PatternID::new_unchecked(id)) { - return Some(id); - } - } - } -} - -impl iter::FusedIterator for SetMatchesIntoIter {} - -/// A borrowed iterator over the set of matches from a regex set. -/// -/// The lifetime `'a` refers to the lifetime of a `SetMatches` value. -/// -/// This will always produces matches in ascending order of index, where the -/// index corresponds to the index of the regex that matched with respect to -/// its position when initially building the set. -#[derive(Clone, Debug)] -pub struct SetMatchesIter<'a>(PatternSetIter<'a>); - -impl<'a> Iterator for SetMatchesIter<'a> { - type Item = usize; - - fn next(&mut self) -> Option<usize> { - self.0.next().map(|pid| pid.as_usize()) - } - - fn size_hint(&self) -> (usize, Option<usize>) { - self.0.size_hint() - } -} - -impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { - fn next_back(&mut self) -> Option<usize> { - self.0.next_back().map(|pid| pid.as_usize()) - } -} - -impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} - -impl fmt::Debug for RegexSet { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "RegexSet({:?})", self.patterns()) - } -} - } - } -} - -define_set! 
{ - unicode, - set_unicode, - &str, - as_bytes_str, -/// ```rust -/// # use regex::RegexSet; -/// let set = RegexSet::new(&[ -/// r"[a-z]+@[a-z]+\.(com|org|net)", -/// r"[a-z]+\.(com|org|net)", -/// ]).unwrap(); -/// -/// // Ask whether any regexes in the set match. -/// assert!(set.is_match("foo@example.com")); -/// -/// // Identify which regexes in the set match. -/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); -/// assert_eq!(vec![0, 1], matches); -/// -/// // Try again, but with text that only matches one of the regexes. -/// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); -/// assert_eq!(vec![1], matches); -/// -/// // Try again, but with text that doesn't match any regex in the set. -/// let matches: Vec<_> = set.matches("example").into_iter().collect(); -/// assert!(matches.is_empty()); -/// ``` -} - -define_set! { - bytes, - set_bytes, - &[u8], - as_bytes_bytes, -/// ```rust -/// # use regex::bytes::RegexSet; -/// let set = RegexSet::new(&[ -/// r"[a-z]+@[a-z]+\.(com|org|net)", -/// r"[a-z]+\.(com|org|net)", -/// ]).unwrap(); -/// -/// // Ask whether any regexes in the set match. -/// assert!(set.is_match(b"foo@example.com")); -/// -/// // Identify which regexes in the set match. -/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); -/// assert_eq!(vec![0, 1], matches); -/// -/// // Try again, but with text that only matches one of the regexes. -/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); -/// assert_eq!(vec![1], matches); -/// -/// // Try again, but with text that doesn't match any regex in the set. 
-/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); -/// assert!(matches.is_empty()); -/// ``` -} diff --git a/src/re_unicode.rs b/src/re_unicode.rs deleted file mode 100644 index 32f4fb64c3..0000000000 --- a/src/re_unicode.rs +++ /dev/null @@ -1,1406 +0,0 @@ -use std::{ - borrow::Cow, - fmt, - iter::FusedIterator, - ops::{Index, Range}, - str::FromStr, - sync::Arc, -}; - -use regex_automata::{meta, util::captures, Input, PatternID}; - -use crate::{ - error::Error, find_byte::find_byte, re_builder::unicode::RegexBuilder, -}; - -/// Escapes all regular expression meta characters in `text`. -/// -/// The string returned may be safely used as a literal in a regular -/// expression. -pub fn escape(text: &str) -> String { - regex_syntax::escape(text) -} - -/// A compiled regular expression for matching Unicode strings. -/// -/// It is represented as either a sequence of bytecode instructions (dynamic) -/// or as a specialized Rust function (native). It can be used to search, split -/// or replace text. All searching is done with an implicit `.*?` at the -/// beginning and end of an expression. To force an expression to match the -/// whole string (or a prefix or a suffix), you must use an anchor like `^` or -/// `$` (or `\A` and `\z`). -/// -/// While this crate will handle Unicode strings (whether in the regular -/// expression or in the search text), all positions returned are **byte -/// indices**. Every byte index is guaranteed to be at a Unicode code point -/// boundary. -/// -/// The lifetimes `'r` and `'t` in this crate correspond to the lifetime of a -/// compiled regular expression and text to search, respectively. -/// -/// The only methods that allocate new strings are the string replacement -/// methods. All other methods (searching and splitting) return borrowed -/// pointers into the string given. 
-/// -/// # Examples -/// -/// Find the location of a US phone number: -/// -/// ```rust -/// # use regex::Regex; -/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); -/// let mat = re.find("phone: 111-222-3333").unwrap(); -/// assert_eq!((mat.start(), mat.end()), (7, 19)); -/// ``` -/// -/// # Using the `std::str::pattern` methods with `Regex` -/// -/// > **Note**: This section requires that this crate is compiled with the -/// > `pattern` Cargo feature enabled, which **requires nightly Rust**. -/// -/// Since `Regex` implements `Pattern`, you can use regexes with methods -/// defined on `&str`. For example, `is_match`, `find`, `find_iter` -/// and `split` can be replaced with `str::contains`, `str::find`, -/// `str::match_indices` and `str::split`. -/// -/// Here are some examples: -/// -/// ```rust,ignore -/// # use regex::Regex; -/// let re = Regex::new(r"\d+").unwrap(); -/// let haystack = "a111b222c"; -/// -/// assert!(haystack.contains(&re)); -/// assert_eq!(haystack.find(&re), Some(1)); -/// assert_eq!(haystack.match_indices(&re).collect::<Vec<_>>(), -/// vec![(1, "111"), (5, "222")]); -/// assert_eq!(haystack.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); -/// ``` -#[derive(Clone)] -pub struct Regex { - pub(crate) meta: meta::Regex, - pub(crate) pattern: Arc<str>, -} - -impl fmt::Display for Regex { - /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "{}", self.as_str()) - } -} - -impl fmt::Debug for Regex { - /// Shows the original regular expression. - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self, f) - } -} - -impl FromStr for Regex { - type Err = Error; - - /// Attempts to parse a string into a regular expression - fn from_str(s: &str) -> Result<Regex, Error> { - Regex::new(s) - } -} - -/// Core regular expression methods. -impl Regex { - /// Compiles a regular expression. 
Once compiled, it can be used repeatedly - /// to search, split or replace text in a string. - /// - /// If an invalid expression is given, then an error is returned. - pub fn new(re: &str) -> Result<Regex, Error> { - RegexBuilder::new(re).build() - } - - /// Returns true if and only if there is a match for the regex in the - /// string given. - /// - /// It is recommended to use this method if all you need to do is test - /// a match, since the underlying matching engine may be able to do less - /// work. - /// - /// # Example - /// - /// Test if some text contains at least one word with exactly 13 - /// Unicode word characters: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let text = "I categorically deny having triskaidekaphobia."; - /// assert!(Regex::new(r"\b\w{13}\b").unwrap().is_match(text)); - /// # } - /// ``` - #[inline] - pub fn is_match(&self, text: &str) -> bool { - self.is_match_at(text, 0) - } - - /// Returns the start and end byte range of the leftmost-first match in - /// `text`. If no match exists, then `None` is returned. - /// - /// Note that this should only be used if you want to discover the position - /// of the match. Testing the existence of a match is faster if you use - /// `is_match`. - /// - /// # Example - /// - /// Find the start and end location of the first word with exactly 13 - /// Unicode word characters: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let text = "I categorically deny having triskaidekaphobia."; - /// let mat = Regex::new(r"\b\w{13}\b").unwrap().find(text).unwrap(); - /// assert_eq!(mat.start(), 2); - /// assert_eq!(mat.end(), 15); - /// # } - /// ``` - #[inline] - pub fn find<'t>(&self, text: &'t str) -> Option<Match<'t>> { - self.find_at(text, 0) - } - - /// Returns an iterator for each successive non-overlapping match in - /// `text`, returning the start and end byte indices with respect to - /// `text`. 
- /// - /// # Example - /// - /// Find the start and end location of every word with exactly 13 Unicode - /// word characters: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let text = "Retroactively relinquishing remunerations is reprehensible."; - /// for mat in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { - /// println!("{:?}", mat); - /// } - /// # } - /// ``` - #[inline] - pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { - Matches { text, it: self.meta.find_iter(text) } - } - - /// Returns the capture groups corresponding to the leftmost-first - /// match in `text`. Capture group `0` always corresponds to the entire - /// match. If no match is found, then `None` is returned. - /// - /// You should only use `captures` if you need access to the location of - /// capturing group matches. Otherwise, `find` is faster for discovering - /// the location of the overall match. - /// - /// # Examples - /// - /// Say you have some text with movie names and their release years, - /// like "'Citizen Kane' (1941)". It'd be nice if we could search for text - /// looking like that, while also extracting the movie name and its release - /// year separately. - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); - /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); - /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); - /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); - /// // You can also access the groups by index using the Index notation. - /// // Note that this will panic on an invalid index. 
- /// assert_eq!(&caps[1], "Citizen Kane"); - /// assert_eq!(&caps[2], "1941"); - /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); - /// # } - /// ``` - /// - /// Note that the full match is at capture group `0`. Each subsequent - /// capture group is indexed by the order of its opening `(`. - /// - /// We can make this example a bit clearer by using *named* capture groups: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") - /// .unwrap(); - /// let text = "Not my favorite movie: 'Citizen Kane' (1941)."; - /// let caps = re.captures(text).unwrap(); - /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); - /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); - /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); - /// // You can also access the groups by name using the Index notation. - /// // Note that this will panic on an invalid group name. - /// assert_eq!(&caps["title"], "Citizen Kane"); - /// assert_eq!(&caps["year"], "1941"); - /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); - /// - /// # } - /// ``` - /// - /// Here we name the capture groups, which we can access with the `name` - /// method or the `Index` notation with a `&str`. Note that the named - /// capture groups are still accessible with `get` or the `Index` notation - /// with a `usize`. - /// - /// The `0`th capture group is always unnamed, so it must always be - /// accessed with `get(0)` or `[0]`. - #[inline] - pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { - self.captures_at(text, 0) - } - - /// Returns an iterator over all the non-overlapping capture groups matched - /// in `text`. This is operationally the same as `find_iter`, except it - /// yields information about capturing group matches. 
- /// - /// # Example - /// - /// We can use this to find all movie titles and their release years in - /// some text, where the movie is formatted like "'Title' (xxxx)": - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"'(?P<title>[^']+)'\s+\((?P<year>\d{4})\)") - /// .unwrap(); - /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; - /// for caps in re.captures_iter(text) { - /// println!("Movie: {:?}, Released: {:?}", - /// &caps["title"], &caps["year"]); - /// } - /// // Output: - /// // Movie: Citizen Kane, Released: 1941 - /// // Movie: The Wizard of Oz, Released: 1939 - /// // Movie: M, Released: 1931 - /// # } - /// ``` - #[inline] - pub fn captures_iter<'r, 't>( - &'r self, - text: &'t str, - ) -> CaptureMatches<'r, 't> { - CaptureMatches { text, it: self.meta.captures_iter(text) } - } - - /// Returns an iterator of substrings of `text` delimited by a match of the - /// regular expression. Namely, each element of the iterator corresponds to - /// text that *isn't* matched by the regular expression. - /// - /// This method will *not* copy the text given. - /// - /// # Example - /// - /// To split a string delimited by arbitrary amounts of spaces or tabs: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"[ \t]+").unwrap(); - /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); - /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); - /// # } - /// ``` - #[inline] - pub fn split<'r, 't>(&'r self, text: &'t str) -> Split<'r, 't> { - Split { text, it: self.meta.split(text) } - } - - /// Returns an iterator of at most `limit` substrings of `text` delimited - /// by a match of the regular expression. (A `limit` of `0` will return no - /// substrings.) Namely, each element of the iterator corresponds to text - /// that *isn't* matched by the regular expression. 
The remainder of the - /// string that is not split will be the last element in the iterator. - /// - /// This method will *not* copy the text given. - /// - /// # Example - /// - /// Get the first two words in some text: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"\W+").unwrap(); - /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); - /// assert_eq!(fields, vec!("Hey", "How", "are you?")); - /// # } - /// ``` - #[inline] - pub fn splitn<'r, 't>( - &'r self, - text: &'t str, - limit: usize, - ) -> SplitN<'r, 't> { - SplitN { text, it: self.meta.splitn(text, limit) } - } - - /// Replaces the leftmost-first match with the replacement provided. - /// The replacement can be a regular string (where `$N` and `$name` are - /// expanded to match capture groups) or a function that takes the matches' - /// `Captures` and returns the replaced string. - /// - /// If no match is found, then a copy of the string is returned unchanged. - /// - /// # Replacement string syntax - /// - /// All instances of `$name` in the replacement text is replaced with the - /// corresponding capture group `name`. - /// - /// `name` may be an integer corresponding to the index of the - /// capture group (counted by order of opening parenthesis where `0` is the - /// entire match) or it can be a name (consisting of letters, digits or - /// underscores) corresponding to a named capture group. - /// - /// If `name` isn't a valid capture group (whether the name doesn't exist - /// or isn't a valid index), then it is replaced with the empty string. - /// - /// The longest possible name is used. e.g., `$1a` looks up the capture - /// group named `1a` and not the capture group at index `1`. To exert more - /// precise control over the name, use braces, e.g., `${1}a`. - /// - /// To write a literal `$` use `$$`. - /// - /// # Examples - /// - /// Note that this function is polymorphic with respect to the replacement. 
- /// In typical usage, this can just be a normal string: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new("[^01]+").unwrap(); - /// assert_eq!(re.replace("1078910", ""), "1010"); - /// # } - /// ``` - /// - /// But anything satisfying the `Replacer` trait will work. For example, - /// a closure of type `|&Captures| -> String` provides direct access to the - /// captures corresponding to a match. This allows one to access - /// capturing group matches easily: - /// - /// ```rust - /// # use regex::Regex; - /// # use regex::Captures; fn main() { - /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); - /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { - /// format!("{} {}", &caps[2], &caps[1]) - /// }); - /// assert_eq!(result, "Bruce Springsteen"); - /// # } - /// ``` - /// - /// But this is a bit cumbersome to use all the time. Instead, a simple - /// syntax is supported that expands `$name` into the corresponding capture - /// group. Here's the last example, but using this expansion technique - /// with named capture groups: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(?P<first>\S+)").unwrap(); - /// let result = re.replace("Springsteen, Bruce", "$first $last"); - /// assert_eq!(result, "Bruce Springsteen"); - /// # } - /// ``` - /// - /// Note that using `$2` instead of `$first` or `$1` instead of `$last` - /// would produce the same result. To write a literal `$` use `$$`. - /// - /// Sometimes the replacement string requires use of curly braces to - /// delineate a capture group replacement and surrounding literal text. 
- /// For example, if we wanted to join two words together with an - /// underscore: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let re = Regex::new(r"(?P<first>\w+)\s+(?P<second>\w+)").unwrap(); - /// let result = re.replace("deep fried", "${first}_$second"); - /// assert_eq!(result, "deep_fried"); - /// # } - /// ``` - /// - /// Without the curly braces, the capture group name `first_` would be - /// used, and since it doesn't exist, it would be replaced with the empty - /// string. - /// - /// Finally, sometimes you just want to replace a literal string with no - /// regard for capturing group expansion. This can be done by wrapping a - /// byte string with `NoExpand`: - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// use regex::NoExpand; - /// - /// let re = Regex::new(r"(?P<last>[^,\s]+),\s+(\S+)").unwrap(); - /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); - /// assert_eq!(result, "$2 $last"); - /// # } - /// ``` - #[inline] - pub fn replace<'t, R: Replacer>( - &self, - text: &'t str, - rep: R, - ) -> Cow<'t, str> { - self.replacen(text, 1, rep) - } - - /// Replaces all non-overlapping matches in `text` with the replacement - /// provided. This is the same as calling `replacen` with `limit` set to - /// `0`. - /// - /// See the documentation for `replace` for details on how to access - /// capturing group matches in the replacement string. - #[inline] - pub fn replace_all<'t, R: Replacer>( - &self, - text: &'t str, - rep: R, - ) -> Cow<'t, str> { - self.replacen(text, 0, rep) - } - - /// Replaces at most `limit` non-overlapping matches in `text` with the - /// replacement provided. If `limit` is 0, then all non-overlapping matches - /// are replaced. - /// - /// See the documentation for `replace` for details on how to access - /// capturing group matches in the replacement string. 
- pub fn replacen<'t, R: Replacer>( - &self, - text: &'t str, - limit: usize, - mut rep: R, - ) -> Cow<'t, str> { - // If we know that the replacement doesn't have any capture expansions, - // then we can use the fast path. The fast path can make a tremendous - // difference: - // - // 1) We use `find_iter` instead of `captures_iter`. Not asking for - // captures generally makes the regex engines faster. - // 2) We don't need to look up all of the capture groups and do - // replacements inside the replacement string. We just push it - // at each match and be done with it. - if let Some(rep) = rep.no_expansion() { - let mut it = self.find_iter(text).enumerate().peekable(); - if it.peek().is_none() { - return Cow::Borrowed(text); - } - let mut new = String::with_capacity(text.len()); - let mut last_match = 0; - for (i, m) in it { - new.push_str(&text[last_match..m.start()]); - new.push_str(&rep); - last_match = m.end(); - if limit > 0 && i >= limit - 1 { - break; - } - } - new.push_str(&text[last_match..]); - return Cow::Owned(new); - } - - // The slower path, which we use if the replacement needs access to - // capture groups. - let mut it = self.captures_iter(text).enumerate().peekable(); - if it.peek().is_none() { - return Cow::Borrowed(text); - } - let mut new = String::with_capacity(text.len()); - let mut last_match = 0; - for (i, cap) in it { - // unwrap on 0 is OK because captures only reports matches - let m = cap.get(0).unwrap(); - new.push_str(&text[last_match..m.start()]); - rep.replace_append(&cap, &mut new); - last_match = m.end(); - if limit > 0 && i >= limit - 1 { - break; - } - } - new.push_str(&text[last_match..]); - Cow::Owned(new) - } -} - -/// Advanced or "lower level" search methods. -impl Regex { - /// Returns the end location of a match in the text given. - /// - /// This method may have the same performance characteristics as - /// `is_match`, except it provides an end location for a match. 
In - /// particular, the location returned *may be shorter* than the proper end - /// of the leftmost-first match that you would find via `Regex::find`. - /// - /// Note that it is not guaranteed that this routine finds the shortest or - /// "earliest" possible match. Instead, the main idea of this API is that - /// it returns the offset at the point at which the internal regex engine - /// has determined that a match has occurred. This may vary depending on - /// which internal regex engine is used, and thus, the offset itself may - /// change. - /// - /// # Example - /// - /// Typically, `a+` would match the entire first sequence of `a` in some - /// text, but `shortest_match` can give up as soon as it sees the first - /// `a`. - /// - /// ```rust - /// # use regex::Regex; - /// # fn main() { - /// let text = "aaaaa"; - /// let pos = Regex::new(r"a+").unwrap().shortest_match(text); - /// assert_eq!(pos, Some(1)); - /// # } - /// ``` - #[inline] - pub fn shortest_match(&self, text: &str) -> Option<usize> { - self.shortest_match_at(text, 0) - } - - /// Returns the same as `shortest_match`, but starts the search at the - /// given offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only match - /// when `start == 0`. - #[inline] - pub fn shortest_match_at( - &self, - text: &str, - start: usize, - ) -> Option<usize> { - let mut input = Input::new(text).earliest(true); - input.set_start(start); - self.meta.search_half(&input).map(|hm| hm.offset()) - } - - /// Returns the same as is_match, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. 
- #[inline] - pub fn is_match_at(&self, text: &str, start: usize) -> bool { - let mut input = Input::new(text); - input.set_start(start); - self.meta.is_match(input) - } - - /// Returns the same as find, but starts the search at the given - /// offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[inline] - pub fn find_at<'t>( - &self, - text: &'t str, - start: usize, - ) -> Option<Match<'t>> { - let mut input = Input::new(text); - input.set_start(start); - self.meta.find(input).map(|m| Match::new(text, m.start(), m.end())) - } - - /// Returns the same as [`Regex::captures`], but starts the search at the - /// given offset. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[inline] - pub fn captures_at<'t>( - &self, - text: &'t str, - start: usize, - ) -> Option<Captures<'t>> { - let mut caps = self.meta.create_captures(); - let mut input = Input::new(text); - input.set_start(start); - self.meta.captures(input, &mut caps); - if caps.is_match() { - Some(Captures { text, caps }) - } else { - None - } - } - - /// This is like `captures`, but uses - /// [`CaptureLocations`](struct.CaptureLocations.html) - /// instead of - /// [`Captures`](struct.Captures.html) in order to amortize allocations. - /// - /// To create a `CaptureLocations` value, use the - /// `Regex::capture_locations` method. - /// - /// This returns the overall match if this was successful, which is always - /// equivalence to the `0`th capture group. 
- #[inline] - pub fn captures_read<'t>( - &self, - locs: &mut CaptureLocations, - text: &'t str, - ) -> Option<Match<'t>> { - self.captures_read_at(locs, text, 0) - } - - /// Returns the same as captures, but starts the search at the given - /// offset and populates the capture locations given. - /// - /// The significance of the starting point is that it takes the surrounding - /// context into consideration. For example, the `\A` anchor can only - /// match when `start == 0`. - #[inline] - pub fn captures_read_at<'t>( - &self, - locs: &mut CaptureLocations, - text: &'t str, - start: usize, - ) -> Option<Match<'t>> { - let mut input = Input::new(text); - input.set_start(start); - self.meta.captures(input, &mut locs.0); - locs.0.get_match().map(|m| Match::new(text, m.start(), m.end())) - } - - /// An undocumented alias for `captures_read_at`. - /// - /// The `regex-capi` crate previously used this routine, so to avoid - /// breaking that crate, we continue to provide the name as an undocumented - /// alias. - #[doc(hidden)] - #[inline] - pub fn read_captures_at<'t>( - &self, - locs: &mut CaptureLocations, - text: &'t str, - start: usize, - ) -> Option<Match<'t>> { - self.captures_read_at(locs, text, start) - } -} - -/// Auxiliary methods. -impl Regex { - /// Returns the original string of this regex. - #[inline] - pub fn as_str(&self) -> &str { - &self.pattern - } - - /// Returns an iterator over the capture names. - pub fn capture_names(&self) -> CaptureNames<'_> { - CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO)) - } - - /// Returns the number of captures. - pub fn captures_len(&self) -> usize { - self.meta.group_info().group_len(PatternID::ZERO) - } - - /// Returns the total number of capturing groups that appear in every - /// possible match. - /// - /// If the number of capture groups can vary depending on the match, then - /// this returns `None`. 
That is, a value is only returned when the number - /// of matching groups is invariant or "static." - /// - /// Note that like [`Regex::captures_len`], this **does** include the - /// implicit capturing group corresponding to the entire match. Therefore, - /// when a non-None value is returned, it is guaranteed to be at least `1`. - /// Stated differently, a return value of `Some(0)` is impossible. - /// - /// # Example - /// - /// This shows a few cases where a static number of capture groups is - /// available and a few cases where it is not. - /// - /// ``` - /// use regex::Regex; - /// - /// let len = |pattern| { - /// Regex::new(pattern).map(|re| re.static_captures_len()) - /// }; - /// - /// assert_eq!(Some(1), len("a")?); - /// assert_eq!(Some(2), len("(a)")?); - /// assert_eq!(Some(2), len("(a)|(b)")?); - /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); - /// assert_eq!(None, len("(a)|b")?); - /// assert_eq!(None, len("a|(b)")?); - /// assert_eq!(None, len("(b)*")?); - /// assert_eq!(Some(2), len("(b)+")?); - /// - /// # Ok::<(), Box<dyn std::error::Error>>(()) - /// ``` - #[inline] - pub fn static_captures_len(&self) -> Option<usize> { - self.meta.static_captures_len() - } - - /// Returns an empty set of capture locations that can be reused in - /// multiple calls to `captures_read` or `captures_read_at`. - #[inline] - pub fn capture_locations(&self) -> CaptureLocations { - CaptureLocations(self.meta.create_captures()) - } - - /// An alias for `capture_locations` to preserve backward compatibility. - /// - /// The `regex-capi` crate uses this method, so to avoid breaking that - /// crate, we continue to export it as an undocumented API. - #[doc(hidden)] - #[inline] - pub fn locations(&self) -> CaptureLocations { - self.capture_locations() - } -} - -/// An iterator over the names of all possible captures. -/// -/// `None` indicates an unnamed capture; the first element (capture 0, the -/// whole matched region) is always unnamed. 
-/// -/// `'r` is the lifetime of the compiled regular expression. -#[derive(Clone, Debug)] -pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>); - -impl<'r> Iterator for CaptureNames<'r> { - type Item = Option<&'r str>; - - #[inline] - fn next(&mut self) -> Option<Option<&'r str>> { - self.0.next() - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.0.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.0.count() - } -} - -impl<'r> ExactSizeIterator for CaptureNames<'r> {} - -impl<'r> FusedIterator for CaptureNames<'r> {} - -/// Yields all substrings delimited by a regular expression match. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the string being split. -#[derive(Debug)] -pub struct Split<'r, 't> { - text: &'t str, - it: meta::Split<'r, 't>, -} - -impl<'r, 't> Iterator for Split<'r, 't> { - type Item = &'t str; - - #[inline] - fn next(&mut self) -> Option<&'t str> { - self.it.next().map(|span| &self.text[span]) - } -} - -impl<'r, 't> FusedIterator for Split<'r, 't> {} - -/// Yields at most `N` substrings delimited by a regular expression match. -/// -/// The last substring will be whatever remains after splitting. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the string being split. -#[derive(Debug)] -pub struct SplitN<'r, 't> { - text: &'t str, - it: meta::SplitN<'r, 't>, -} - -impl<'r, 't> Iterator for SplitN<'r, 't> { - type Item = &'t str; - - #[inline] - fn next(&mut self) -> Option<&'t str> { - self.it.next().map(|span| &self.text[span]) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.it.size_hint() - } -} - -impl<'r, 't> FusedIterator for SplitN<'r, 't> {} - -/// Match represents a single match of a regex in a haystack. -/// -/// The lifetime parameter `'t` refers to the lifetime of the matched text. 
-#[derive(Copy, Clone, Eq, PartialEq)] -pub struct Match<'t> { - text: &'t str, - start: usize, - end: usize, -} - -impl<'t> Match<'t> { - /// Returns the starting byte offset of the match in the haystack. - #[inline] - pub fn start(&self) -> usize { - self.start - } - - /// Returns the ending byte offset of the match in the haystack. - #[inline] - pub fn end(&self) -> usize { - self.end - } - - /// Returns true if and only if this match has a length of zero. - #[inline] - pub fn is_empty(&self) -> bool { - self.start == self.end - } - - /// Returns the length, in bytes, of this match. - #[inline] - pub fn len(&self) -> usize { - self.end - self.start - } - - /// Returns the range over the starting and ending byte offsets of the - /// match in the haystack. - #[inline] - pub fn range(&self) -> Range<usize> { - self.start..self.end - } - - /// Returns the matched text. - #[inline] - pub fn as_str(&self) -> &'t str { - &self.text[self.range()] - } - - /// Creates a new match from the given haystack and byte offsets. - #[inline] - fn new(haystack: &'t str, start: usize, end: usize) -> Match<'t> { - Match { text: haystack, start, end } - } -} - -impl<'t> std::fmt::Debug for Match<'t> { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_struct("Match") - .field("start", &self.start) - .field("end", &self.end) - .field("string", &self.as_str()) - .finish() - } -} - -impl<'t> From<Match<'t>> for &'t str { - fn from(m: Match<'t>) -> &'t str { - m.as_str() - } -} - -impl<'t> From<Match<'t>> for Range<usize> { - fn from(m: Match<'t>) -> Range<usize> { - m.range() - } -} - -/// CaptureLocations is a low level representation of the raw offsets of each -/// submatch. -/// -/// You can think of this as a lower level [`Captures`], where this type does -/// not support named capturing groups directly and it does not borrow the text -/// that these offsets were matched on. 
-/// -/// Primarily, this type is useful when using the lower level `Regex` APIs -/// such as `read_captures`, which permits amortizing the allocation in which -/// capture match locations are stored. -/// -/// In order to build a value of this type, you'll need to call the -/// `capture_locations` method on the `Regex` being used to execute the search. -/// The value returned can then be reused in subsequent searches. -/// -/// # Example -/// -/// This example shows how to create and use `CaptureLocations` in a search. -/// -/// ``` -/// use regex::Regex; -/// -/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); -/// let mut locs = re.capture_locations(); -/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); -/// assert_eq!(0..17, m.range()); -/// assert_eq!(Some((0, 17)), locs.get(0)); -/// assert_eq!(Some((0, 5)), locs.get(1)); -/// assert_eq!(Some((6, 17)), locs.get(2)); -/// -/// // Asking for an invalid capture group always returns None. -/// assert_eq!(None, locs.get(3)); -/// assert_eq!(None, locs.get(34973498648)); -/// assert_eq!(None, locs.get(9944060567225171988)); -/// ``` -#[derive(Clone, Debug)] -pub struct CaptureLocations(captures::Captures); - -/// A type alias for `CaptureLocations` for backwards compatibility. -/// -/// Previously, we exported `CaptureLocations` as `Locations` in an -/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), -/// we continue re-exporting the same undocumented API. -#[doc(hidden)] -pub type Locations = CaptureLocations; - -impl CaptureLocations { - /// Returns the start and end positions of the Nth capture group. Returns - /// `None` if `i` is not a valid capture group or if the capture group did - /// not match anything. The positions returned are *always* byte indices - /// with respect to the original string matched. 
- #[inline] - pub fn get(&self, i: usize) -> Option<(usize, usize)> { - self.0.get_group(i).map(|sp| (sp.start, sp.end)) - } - - /// Returns the total number of capture groups (even if they didn't match). - /// - /// This is always at least `1` since every regex has at least `1` - /// capturing group that corresponds to the entire match. - #[inline] - pub fn len(&self) -> usize { - self.0.group_len() - } - - /// An alias for the `get` method for backwards compatibility. - /// - /// Previously, we exported `get` as `pos` in an undocumented API. To - /// prevent breaking that code (e.g., in `regex-capi`), we continue - /// re-exporting the same undocumented API. - #[doc(hidden)] - #[inline] - pub fn pos(&self, i: usize) -> Option<(usize, usize)> { - self.get(i) - } -} - -/// Captures represents a group of captured strings for a single match. -/// -/// The 0th capture always corresponds to the entire match. Each subsequent -/// index corresponds to the next capture group in the regex. If a capture -/// group is named, then the matched string is *also* available via the `name` -/// method. (Note that the 0th capture is always unnamed and so must be -/// accessed with the `get` method.) -/// -/// Positions returned from a capture group are always byte indices. -/// -/// `'t` is the lifetime of the matched text. -pub struct Captures<'t> { - text: &'t str, - caps: captures::Captures, -} - -impl<'t> Captures<'t> { - /// Returns the match associated with the capture group at index `i`. If - /// `i` does not correspond to a capture group, or if the capture group - /// did not participate in the match, then `None` is returned. 
- /// - /// # Examples - /// - /// Get the text of the match with a default of an empty string if this - /// group didn't participate in the match: - /// - /// ```rust - /// # use regex::Regex; - /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); - /// let caps = re.captures("abc123").unwrap(); - /// - /// let text1 = caps.get(1).map_or("", |m| m.as_str()); - /// let text2 = caps.get(2).map_or("", |m| m.as_str()); - /// assert_eq!(text1, "123"); - /// assert_eq!(text2, ""); - /// ``` - #[inline] - pub fn get(&self, i: usize) -> Option<Match<'t>> { - self.caps - .get_group(i) - .map(|sp| Match::new(self.text, sp.start, sp.end)) - } - - /// Returns the match for the capture group named `name`. If `name` isn't a - /// valid capture group or didn't match anything, then `None` is returned. - #[inline] - pub fn name(&self, name: &str) -> Option<Match<'t>> { - self.caps - .get_group_by_name(name) - .map(|sp| Match::new(self.text, sp.start, sp.end)) - } - - /// An iterator that yields all capturing matches in the order in which - /// they appear in the regex. If a particular capture group didn't - /// participate in the match, then `None` is yielded for that capture. - /// - /// The first match always corresponds to the overall match of the regex. - #[inline] - pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { - SubCaptureMatches { text: self.text, it: self.caps.iter() } - } - - /// Expands all instances of `$name` in `replacement` to the corresponding - /// capture group `name`, and writes them to the `dst` buffer given. - /// - /// `name` may be an integer corresponding to the index of the capture - /// group (counted by order of opening parenthesis where `0` is the - /// entire match) or it can be a name (consisting of letters, digits or - /// underscores) corresponding to a named capture group. - /// - /// If `name` isn't a valid capture group (whether the name doesn't exist - /// or isn't a valid index), then it is replaced with the empty string. 
- /// - /// The longest possible name consisting of the characters `[_0-9A-Za-z]` - /// is used. e.g., `$1a` looks up the capture group named `1a` and not the - /// capture group at index `1`. To exert more precise control over the - /// name, or to refer to a capture group name that uses characters outside - /// of `[_0-9A-Za-z]`, use braces, e.g., `${1}a` or `${foo[bar].baz}`. When - /// using braces, any sequence of characters is permitted. If the sequence - /// does not refer to a capture group name in the corresponding regex, then - /// it is replaced with an empty string. - /// - /// To write a literal `$` use `$$`. - #[inline] - pub fn expand(&self, replacement: &str, dst: &mut String) { - self.caps.interpolate_string_into(self.text, replacement, dst); - } - - /// Returns the total number of capture groups (even if they didn't match). - /// - /// This is always at least `1`, since every regex has at least one capture - /// group that corresponds to the full match. - #[inline] - pub fn len(&self) -> usize { - self.caps.group_len() - } -} - -impl<'t> fmt::Debug for Captures<'t> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_tuple("Captures").field(&self.caps).finish() - } -} - -/// Get a group by index. -/// -/// `'t` is the lifetime of the matched text. -/// -/// The text can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `get()` instead. -/// -/// # Panics -/// -/// If there is no group at the given index. -impl<'t> Index<usize> for Captures<'t> { - type Output = str; - - fn index(&self, i: usize) -> &str { - self.get(i) - .map(|m| m.as_str()) - .unwrap_or_else(|| panic!("no group at index '{}'", i)) - } -} - -/// Get a group by name. -/// -/// `'t` is the lifetime of the matched text and `'i` is the lifetime -/// of the group name (the index). 
-/// -/// The text can't outlive the `Captures` object if this method is -/// used, because of how `Index` is defined (normally `a[i]` is part -/// of `a` and can't outlive it); to do that, use `name` instead. -/// -/// # Panics -/// -/// If there is no group named by the given value. -impl<'t, 'i> Index<&'i str> for Captures<'t> { - type Output = str; - - fn index<'a>(&'a self, name: &'i str) -> &'a str { - self.name(name) - .map(|m| m.as_str()) - .unwrap_or_else(|| panic!("no group named '{}'", name)) - } -} - -/// An iterator that yields all capturing matches in the order in which they -/// appear in the regex. -/// -/// If a particular capture group didn't participate in the match, then `None` -/// is yielded for that capture. The first match always corresponds to the -/// overall match of the regex. -/// -/// The lifetime `'c` corresponds to the lifetime of the `Captures` value, and -/// the lifetime `'t` corresponds to the originally matched text. -#[derive(Clone, Debug)] -pub struct SubCaptureMatches<'c, 't> { - text: &'t str, - it: captures::CapturesPatternIter<'c>, -} - -impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { - type Item = Option<Match<'t>>; - - #[inline] - fn next(&mut self) -> Option<Option<Match<'t>>> { - self.it.next().map(|group| { - group.map(|sp| Match::new(self.text, sp.start, sp.end)) - }) - } - - #[inline] - fn size_hint(&self) -> (usize, Option<usize>) { - self.it.size_hint() - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'c, 't> ExactSizeIterator for SubCaptureMatches<'c, 't> {} - -impl<'c, 't> FusedIterator for SubCaptureMatches<'c, 't> {} - -/// An iterator that yields all non-overlapping capture groups matching a -/// particular regular expression. -/// -/// The iterator stops when no more matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the matched string. 
-#[derive(Debug)] -pub struct CaptureMatches<'r, 't> { - text: &'t str, - it: meta::CapturesMatches<'r, 't>, -} - -impl<'r, 't> Iterator for CaptureMatches<'r, 't> { - type Item = Captures<'t>; - - #[inline] - fn next(&mut self) -> Option<Captures<'t>> { - self.it.next().map(|caps| Captures { text: self.text, caps }) - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'r, 't> FusedIterator for CaptureMatches<'r, 't> {} - -/// An iterator over all non-overlapping matches for a particular string. -/// -/// The iterator yields a `Match` value. The iterator stops when no more -/// matches can be found. -/// -/// `'r` is the lifetime of the compiled regular expression and `'t` is the -/// lifetime of the matched string. -#[derive(Debug)] -pub struct Matches<'r, 't> { - text: &'t str, - it: meta::FindMatches<'r, 't>, -} - -impl<'r, 't> Iterator for Matches<'r, 't> { - type Item = Match<'t>; - - #[inline] - fn next(&mut self) -> Option<Match<'t>> { - self.it.next().map(|sp| Match::new(self.text, sp.start(), sp.end())) - } - - #[inline] - fn count(self) -> usize { - self.it.count() - } -} - -impl<'r, 't> FusedIterator for Matches<'r, 't> {} - -/// Replacer describes types that can be used to replace matches in a string. -/// -/// In general, users of this crate shouldn't need to implement this trait, -/// since implementations are already provided for `&str` along with other -/// variants of string types and `FnMut(&Captures) -> String` (or any -/// `FnMut(&Captures) -> T` where `T: AsRef<str>`), which covers most use cases. -pub trait Replacer { - /// Appends text to `dst` to replace the current match. - /// - /// The current match is represented by `caps`, which is guaranteed to - /// have a match at capture group `0`. - /// - /// For example, a no-op replacement would be - /// `dst.push_str(caps.get(0).unwrap().as_str())`. 
- fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); - - /// Return a fixed unchanging replacement string. - /// - /// When doing replacements, if access to `Captures` is not needed (e.g., - /// the replacement byte string does not need `$` expansion), then it can - /// be beneficial to avoid finding sub-captures. - /// - /// In general, this is called once for every call to `replacen`. - fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { - None - } - - /// Return a `Replacer` that borrows and wraps this `Replacer`. - /// - /// This is useful when you want to take a generic `Replacer` (which might - /// not be cloneable) and use it without consuming it, so it can be used - /// more than once. - /// - /// # Example - /// - /// ``` - /// use regex::{Regex, Replacer}; - /// - /// fn replace_all_twice<R: Replacer>( - /// re: Regex, - /// src: &str, - /// mut rep: R, - /// ) -> String { - /// let dst = re.replace_all(src, rep.by_ref()); - /// let dst = re.replace_all(&dst, rep.by_ref()); - /// dst.into_owned() - /// } - /// ``` - fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { - ReplacerRef(self) - } -} - -/// By-reference adaptor for a `Replacer` -/// -/// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). 
-#[derive(Debug)] -pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); - -impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - self.0.replace_append(caps, dst) - } - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - self.0.no_expansion() - } -} - -impl<'a> Replacer for &'a str { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - caps.expand(*self, dst); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - no_expansion(self) - } -} - -impl<'a> Replacer for &'a String { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - self.as_str().replace_append(caps, dst) - } - - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - no_expansion(self) - } -} - -impl Replacer for String { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - self.as_str().replace_append(caps, dst) - } - - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - no_expansion(self) - } -} - -impl<'a> Replacer for Cow<'a, str> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - self.as_ref().replace_append(caps, dst) - } - - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - no_expansion(self) - } -} - -impl<'a> Replacer for &'a Cow<'a, str> { - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - self.as_ref().replace_append(caps, dst) - } - - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - no_expansion(self) - } -} - -fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { - let s = t.as_ref(); - match find_byte(b'$', s.as_bytes()) { - Some(_) => None, - None => Some(Cow::Borrowed(s)), - } -} - -impl<F, T> Replacer for F -where - F: FnMut(&Captures<'_>) -> T, - T: AsRef<str>, -{ - fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { - dst.push_str((*self)(caps).as_ref()); - } -} - -/// `NoExpand` indicates literal string replacement. 
-/// -/// It can be used with `replace` and `replace_all` to do a literal string -/// replacement without expanding `$name` to their corresponding capture -/// groups. This can be both convenient (to avoid escaping `$`, for example) -/// and performant (since capture groups don't need to be found). -/// -/// `'t` is the lifetime of the literal text. -#[derive(Clone, Debug)] -pub struct NoExpand<'t>(pub &'t str); - -impl<'t> Replacer for NoExpand<'t> { - fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { - dst.push_str(self.0); - } - - fn no_expansion(&mut self) -> Option<Cow<'_, str>> { - Some(Cow::Borrowed(self.0)) - } -} diff --git a/src/regex/bytes.rs b/src/regex/bytes.rs new file mode 100644 index 0000000000..fc4238bcd9 --- /dev/null +++ b/src/regex/bytes.rs @@ -0,0 +1,2579 @@ +use alloc::{borrow::Cow, sync::Arc, vec::Vec}; + +use regex_automata::{meta, util::captures, Input, PatternID}; + +use crate::{bytes::RegexBuilder, error::Error}; + +/// A compiled regular expression for searching haystacks made of arbitrary +/// bytes. +/// +/// A `Regex` can be used to search haystacks, split haystacks into substrings +/// or replace substrings in a haystack with a different substring. All +/// searching is done with an implicit `(?s:.)*?` at the beginning and end of +/// a pattern. To force an expression to match the whole string (or a prefix +/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// Like the `Regex` type in the parent module, matches with this regex return +/// byte offsets into the haystack. **Unlike** the parent `Regex` type, these +/// byte offsets may not correspond to UTF-8 sequence boundaries since the +/// regexes in this module can match arbitrary bytes. +/// +/// The only methods that allocate new byte strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// references into the haystack given.
+/// +/// # Example +/// +/// Find the offsets of a US phone number: +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let m = re.find(b"phone: 111-222-3333").unwrap(); +/// assert_eq!(7..19, m.range()); +/// ``` +/// +/// # Example: extracting capture groups +/// +/// A common way to use regexes is with capture groups. That is, instead of +/// just looking for matches of an entire regex, parentheses are used to create +/// groups that represent part of the match. +/// +/// For example, consider a haystack with multiple lines, and each line has +/// three whitespace delimited fields where the second field is expected to be +/// a number and the third field a boolean. To make this convenient, we use +/// the [`Captures::extract`] API to put the strings that match each group +/// into a fixed size array: +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let hay = b" +/// rabbit 54 true +/// groundhog 2 true +/// does not match +/// fox 109 false +/// "; +/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap(); +/// let mut fields: Vec<(&[u8], i64, bool)> = vec![]; +/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) { +/// // These unwraps are OK because our pattern is written in a way where +/// // all matches for f2 and f3 will be valid UTF-8. +/// let f2 = std::str::from_utf8(f2).unwrap(); +/// let f3 = std::str::from_utf8(f3).unwrap(); +/// fields.push((f1, f2.parse()?, f3.parse()?)); +/// } +/// assert_eq!(fields, vec![ +/// (&b"rabbit"[..], 54, true), +/// (&b"groundhog"[..], 2, true), +/// (&b"fox"[..], 109, false), +/// ]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: matching invalid UTF-8 +/// +/// One of the reasons for searching `&[u8]` haystacks is that the `&[u8]` +/// might not be valid UTF-8. Indeed, with a `bytes::Regex`, patterns that +/// match invalid UTF-8 are explicitly allowed. 
Here's one example that looks +/// for valid UTF-8 fields that might be separated by invalid UTF-8. In this +/// case, we use `(?s-u:.)`, which matches any byte. Attempting to use it in a +/// top-level `Regex` will result in the regex failing to compile. Notice also +/// that we use `.` with Unicode mode enabled, in which case, only valid UTF-8 +/// is matched. In this way, we can build one pattern where some parts only +/// match valid UTF-8 while other parts are more permissive. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// // F0 9F 92 A9 is the UTF-8 encoding for a Pile of Poo. +/// let hay = b"\xFF\xFFfoo\xFF\xFF\xFF\xF0\x9F\x92\xA9\xFF"; +/// // An equivalent to '(?s-u:.)' is '(?-u:[\x00-\xFF])'. +/// let re = Regex::new(r"(?s)(?-u:.)*?(?<f1>.+)(?-u:.)*?(?<f2>.+)").unwrap(); +/// let caps = re.captures(hay).unwrap(); +/// assert_eq!(&caps["f1"], &b"foo"[..]); +/// assert_eq!(&caps["f2"], "💩".as_bytes()); +/// ``` +#[derive(Clone)] +pub struct Regex { + pub(crate) meta: meta::Regex, + pub(crate) pattern: Arc<str>, +} + +impl core::fmt::Display for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl core::fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_tuple("Regex").field(&self.as_str()).finish() + } +} + +impl core::str::FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result<Regex, Error> { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace substrings in a haystack. 
+ ///
+ /// Note that regex compilation tends to be a somewhat expensive process,
+ /// and unlike higher level environments, compilation is not automatically
+ /// cached for you. One should endeavor to compile a regex once and then
+ /// reuse it. For example, it's a bad idea to compile the same regex
+ /// repeatedly in a loop.
+ ///
+ /// # Errors
+ ///
+ /// If an invalid pattern is given, then an error is returned.
+ /// An error is also returned if the pattern is valid, but would
+ /// produce a regex that is bigger than the configured size limit via
+ /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by
+ /// default.)
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// // An invalid pattern because of an unclosed parenthesis
+ /// assert!(Regex::new(r"foo(bar").is_err());
+ /// // An invalid pattern because the regex would be too big
+ /// // because Unicode tends to inflate things.
+ /// assert!(Regex::new(r"\w{1000}").is_err());
+ /// // Disabling Unicode can make the regex much smaller,
+ /// // potentially by up to or more than an order of magnitude.
+ /// assert!(Regex::new(r"(?-u:\w){1000}").is_ok());
+ /// ```
+ pub fn new(re: &str) -> Result<Regex, Error> {
+ RegexBuilder::new(re).build()
+ }
+
+ /// Returns true if and only if there is a match for the regex anywhere
+ /// in the haystack given.
+ ///
+ /// It is recommended to use this method if all you need to do is test
+ /// whether a match exists, since the underlying matching engine may be
+ /// able to do less work.
+ /// + /// # Example + /// + /// Test if some haystack contains at least one word with exactly 13 + /// Unicode word characters: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = b"I categorically deny having triskaidekaphobia."; + /// assert!(re.is_match(hay)); + /// ``` + #[inline] + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.is_match_at(haystack, 0) + } + + /// This routine searches for the first match of this regex in the + /// haystack given, and if found, returns a [`Match`]. The `Match` + /// provides access to both the byte offsets of the match and the actual + /// substring that matched. + /// + /// Note that this should only be used if you want to find the entire + /// match. If instead you just want to test the existence of a match, + /// it's potentially faster to use `Regex::is_match(hay)` instead of + /// `Regex::find(hay).is_some()`. + /// + /// # Example + /// + /// Find the first word with exactly 13 Unicode word characters: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = b"I categorically deny having triskaidekaphobia."; + /// let mat = re.find(hay).unwrap(); + /// assert_eq!(2..15, mat.range()); + /// assert_eq!(b"categorically", mat.as_bytes()); + /// ``` + #[inline] + pub fn find<'h>(&self, haystack: &'h [u8]) -> Option<Match<'h>> { + self.find_at(haystack, 0) + } + + /// Returns an iterator that yields successive non-overlapping matches in + /// the given haystack. The iterator yields values of type [`Match`]. + /// + /// # Time complexity + /// + /// Note that since `find_iter` runs potentially many searches on the + /// haystack and since each search has worst case `O(m * n)` time + /// complexity, the overall worst case time complexity for iteration is + /// `O(m * n^2)`. 
+ /// + /// # Example + /// + /// Find every word with exactly 13 Unicode word characters: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = b"Retroactively relinquishing remunerations is reprehensible."; + /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_bytes()).collect(); + /// assert_eq!(matches, vec![ + /// &b"Retroactively"[..], + /// &b"relinquishing"[..], + /// &b"remunerations"[..], + /// &b"reprehensible"[..], + /// ]); + /// ``` + #[inline] + pub fn find_iter<'r, 'h>(&'r self, haystack: &'h [u8]) -> Matches<'r, 'h> { + Matches { haystack, it: self.meta.find_iter(haystack) } + } + + /// This routine searches for the first match of this regex in the haystack + /// given, and if found, returns not only the overall match but also the + /// matches of each capture group in the regex. If no match is found, then + /// `None` is returned. + /// + /// Capture group `0` always corresponds to an implicit unnamed group that + /// includes the entire match. If a match is found, this group is always + /// present. Subsequent groups may be named and are numbered, starting + /// at 1, by the order in which the opening parenthesis appears in the + /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`, + /// `b` and `c` correspond to capture group indices `1`, `2` and `3`, + /// respectively. + /// + /// You should only use `captures` if you need access to the capture group + /// matches. Otherwise, [`Regex::find`] is generally faster for discovering + /// just the overall match. + /// + /// # Example + /// + /// Say you have some haystack with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for + /// strings looking like that, while also extracting the movie name and its + /// release year separately. The example below shows how to do that. 
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(hay).unwrap(); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)"); + /// assert_eq!(caps.get(1).unwrap().as_bytes(), b"Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_bytes(), b"1941"); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. In this case, these + /// // accesses are always correct because the overall regex will only + /// // match when these capture groups match. + /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)"); + /// assert_eq!(&caps[1], b"Citizen Kane"); + /// assert_eq!(&caps[2], b"1941"); + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap(); + /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(hay).unwrap(); + /// assert_eq!(caps.get(0).unwrap().as_bytes(), b"'Citizen Kane' (1941)"); + /// assert_eq!(caps.name("title").unwrap().as_bytes(), b"Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_bytes(), b"1941"); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. In this case, + /// // these accesses are always correct because the overall regex will + /// // only match when these capture groups match. 
+ /// assert_eq!(&caps[0], b"'Citizen Kane' (1941)");
+ /// assert_eq!(&caps["title"], b"Citizen Kane");
+ /// assert_eq!(&caps["year"], b"1941");
+ /// ```
+ ///
+ /// Here we name the capture groups, which we can access with the `name`
+ /// method or the `Index` notation with a `&str`. Note that the named
+ /// capture groups are still accessible with `get` or the `Index` notation
+ /// with a `usize`.
+ ///
+ /// The `0`th capture group is always unnamed, so it must always be
+ /// accessed with `get(0)` or `[0]`.
+ ///
+ /// Finally, one other way to get the matched substrings is with the
+ /// [`Captures::extract`] API:
+ ///
+ /// ```
+ /// use regex::bytes::Regex;
+ ///
+ /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap();
+ /// let hay = b"Not my favorite movie: 'Citizen Kane' (1941).";
+ /// let (full, [title, year]) = re.captures(hay).unwrap().extract();
+ /// assert_eq!(full, b"'Citizen Kane' (1941)");
+ /// assert_eq!(title, b"Citizen Kane");
+ /// assert_eq!(year, b"1941");
+ /// ```
+ #[inline]
+ pub fn captures<'h>(&self, haystack: &'h [u8]) -> Option<Captures<'h>> {
+ self.captures_at(haystack, 0)
+ }
+
+ /// Returns an iterator that yields successive non-overlapping matches in
+ /// the given haystack. The iterator yields values of type [`Captures`].
+ ///
+ /// This is the same as [`Regex::find_iter`], but instead of only providing
+ /// access to the overall match, each value yielded includes access to the
+ /// matches of all capture groups in the regex. Reporting this extra match
+ /// data is potentially costly, so callers should only use `captures_iter`
+ /// over `find_iter` when they actually need access to the capture group
+ /// matches.
+ ///
+ /// # Time complexity
+ ///
+ /// Note that since `captures_iter` runs potentially many searches on the
+ /// haystack and since each search has worst case `O(m * n)` time
+ /// complexity, the overall worst case time complexity for iteration is
+ /// `O(m * n^2)`.
+ /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some haystack, where the movie is formatted like "'Title' (xxxx)": + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap(); + /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// let mut movies = vec![]; + /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) { + /// // OK because [0-9]{4} can only match valid UTF-8. + /// let year = std::str::from_utf8(year).unwrap(); + /// movies.push((title, year.parse::<i64>()?)); + /// } + /// assert_eq!(movies, vec![ + /// (&b"Citizen Kane"[..], 1941), + /// (&b"The Wizard of Oz"[..], 1939), + /// (&b"M"[..], 1931), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Or with named groups: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap(); + /// let hay = b"'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// let mut it = re.captures_iter(hay); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], b"Citizen Kane"); + /// assert_eq!(&caps["year"], b"1941"); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], b"The Wizard of Oz"); + /// assert_eq!(&caps["year"], b"1939"); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], b"M"); + /// assert_eq!(&caps["year"], b"1931"); + /// ``` + #[inline] + pub fn captures_iter<'r, 'h>( + &'r self, + haystack: &'h [u8], + ) -> CaptureMatches<'r, 'h> { + CaptureMatches { haystack, it: self.meta.captures_iter(haystack) } + } + + /// Returns an iterator of substrings of the haystack given, delimited by a + /// match of the regex. Namely, each element of the iterator corresponds to + /// a part of the haystack that *isn't* matched by the regular expression. 
+ /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let hay = b"a b \t c\td e"; + /// let fields: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(fields, vec![ + /// &b"a"[..], &b"b"[..], &b"c"[..], &b"d"[..], &b"e"[..], + /// ]); + /// ``` + /// + /// # Example: more cases + /// + /// Basic usage: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = b"Mary had a little lamb"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![ + /// &b"Mary"[..], &b"had"[..], &b"a"[..], &b"little"[..], &b"lamb"[..], + /// ]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b""; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![&b""[..]]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b"lionXXtigerXleopard"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![ + /// &b"lion"[..], &b""[..], &b"tiger"[..], &b"leopard"[..], + /// ]); + /// + /// let re = Regex::new(r"::").unwrap(); + /// let hay = b"lion::tiger::leopard"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![&b"lion"[..], &b"tiger"[..], &b"leopard"[..]]); + /// ``` + /// + /// If a haystack contains multiple contiguous matches, you will end up + /// with empty spans yielded by the iterator: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b"XXXXaXXbXc"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// 
assert_eq!(got, vec![ + /// &b""[..], &b""[..], &b""[..], &b""[..], + /// &b"a"[..], &b""[..], &b"b"[..], &b"c"[..], + /// ]); + /// + /// let re = Regex::new(r"/").unwrap(); + /// let hay = b"(///)"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![&b"("[..], &b""[..], &b""[..], &b")"[..]]); + /// ``` + /// + /// Separators at the start or end of a haystack are neighbored by empty + /// substring. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"0").unwrap(); + /// let hay = b"010"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![&b""[..], &b"1"[..], &b""[..]]); + /// ``` + /// + /// When the regex can match the empty string, it splits at every byte + /// position in the haystack. This includes between all UTF-8 code units. + /// (The top-level [`Regex::split`](crate::Regex::split) will only split + /// at valid UTF-8 boundaries.) + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let hay = "☃".as_bytes(); + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![ + /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..], + /// ]); + /// ``` + /// + /// Contiguous separators (commonly shows up with whitespace), can lead to + /// possibly surprising behavior. For example, this code is correct: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = b" a b c"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// assert_eq!(got, vec![ + /// &b""[..], &b""[..], &b""[..], &b""[..], + /// &b"a"[..], &b""[..], &b"b"[..], &b"c"[..], + /// ]); + /// ``` + /// + /// It does *not* give you `["a", "b", "c"]`. 
For that behavior, you'd want + /// to match contiguous space characters: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r" +").unwrap(); + /// let hay = b" a b c"; + /// let got: Vec<&[u8]> = re.split(hay).collect(); + /// // N.B. This does still include a leading empty span because ' +' + /// // matches at the beginning of the haystack. + /// assert_eq!(got, vec![&b""[..], &b"a"[..], &b"b"[..], &b"c"[..]]); + /// ``` + #[inline] + pub fn split<'r, 'h>(&'r self, haystack: &'h [u8]) -> Split<'r, 'h> { + Split { haystack, it: self.meta.split(haystack) } + } + + /// Returns an iterator of at most `limit` substrings of the haystack + /// given, delimited by a match of the regex. (A `limit` of `0` will return + /// no substrings.) Namely, each element of the iterator corresponds to a + /// part of the haystack that *isn't* matched by the regular expression. + /// The remainder of the haystack that is not split will be the last + /// element in the iterator. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// Although note that the worst case time here has an upper bound given + /// by the `limit` parameter. + /// + /// # Example + /// + /// Get the first two words in some haystack: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\W+").unwrap(); + /// let hay = b"Hey! 
How are you?"; + /// let fields: Vec<&[u8]> = re.splitn(hay, 3).collect(); + /// assert_eq!(fields, vec![&b"Hey"[..], &b"How"[..], &b"are you?"[..]]); + /// ``` + /// + /// # Examples: more cases + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = b"Mary had a little lamb"; + /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec![&b"Mary"[..], &b"had"[..], &b"a little lamb"[..]]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b""; + /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec![&b""[..]]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b"lionXXtigerXleopard"; + /// let got: Vec<&[u8]> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec![&b"lion"[..], &b""[..], &b"tigerXleopard"[..]]); + /// + /// let re = Regex::new(r"::").unwrap(); + /// let hay = b"lion::tiger::leopard"; + /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect(); + /// assert_eq!(got, vec![&b"lion"[..], &b"tiger::leopard"[..]]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b"abcXdef"; + /// let got: Vec<&[u8]> = re.splitn(hay, 1).collect(); + /// assert_eq!(got, vec![&b"abcXdef"[..]]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b"abcdef"; + /// let got: Vec<&[u8]> = re.splitn(hay, 2).collect(); + /// assert_eq!(got, vec![&b"abcdef"[..]]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = b"abcXdef"; + /// let got: Vec<&[u8]> = re.splitn(hay, 0).collect(); + /// assert!(got.is_empty()); + /// ``` + #[inline] + pub fn splitn<'r, 'h>( + &'r self, + haystack: &'h [u8], + limit: usize, + ) -> SplitN<'r, 'h> { + SplitN { haystack, it: self.meta.splitn(haystack, limit) } + } + + /// Replaces the leftmost-first match in the given haystack with the + /// replacement provided. 
The replacement can be a regular string (where + /// `$N` and `$name` are expanded to match capture groups) or a function + /// that takes a [`Captures`] and returns the replaced string. + /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// # Replacement string syntax + /// + /// All instances of `$ref` in the replacement string are replaced with + /// the substring corresponding to the capture group identified by `ref`. + /// + /// `ref` may be an integer corresponding to the index of the capture group + /// (counted by order of opening parenthesis where `0` is the entire match) + /// or it can be a name (consisting of letters, digits or underscores) + /// corresponding to a named capture group. + /// + /// If `ref` isn't a valid capture group (whether the name doesn't exist or + /// isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. For example, `$1a` looks up the + /// capture group named `1a` and not the capture group at index `1`. To + /// exert more precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Example + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal string: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"[^01]+").unwrap(); + /// assert_eq!(re.replace(b"1078910", b""), &b"1010"[..]); + /// ``` + /// + /// But anything satisfying the [`Replacer`] trait will work. For example, + /// a closure of type `|&Captures| -> String` provides direct access to the + /// captures corresponding to a match. 
This allows one to access capturing + /// group matches easily: + /// + /// ``` + /// use regex::bytes::{Captures, Regex}; + /// + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", |caps: &Captures| { + /// let mut buf = vec![]; + /// buf.extend_from_slice(&caps[2]); + /// buf.push(b' '); + /// buf.extend_from_slice(&caps[1]); + /// buf + /// }); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported (as described above) that expands `$name` into the + /// corresponding capture group. Here's the last example, but using this + /// expansion technique with named capture groups: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", b"$first $last"); + /// assert_eq!(result, &b"Bruce Springsteen"[..]); + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement when it is adjacent to some other + /// literal text. For example, if we wanted to join two words together with + /// an underscore: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap(); + /// let result = re.replace(b"deep fried", b"${first}_$second"); + /// assert_eq!(result, &b"deep_fried"[..]); + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. 
This can be done by wrapping a + /// string with [`NoExpand`]: + /// + /// ``` + /// use regex::bytes::{NoExpand, Regex}; + /// + /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); + /// assert_eq!(result, &b"$2 $last"[..]); + /// ``` + /// + /// Using `NoExpand` may also be faster, since the replacement string won't + /// need to be parsed for the `$` syntax. + #[inline] + pub fn replace<'h, R: Replacer>( + &self, + haystack: &'h [u8], + rep: R, + ) -> Cow<'h, [u8]> { + self.replacen(haystack, 1, rep) + } + + /// Replaces all non-overlapping matches in the haystack with the + /// replacement provided. This is the same as calling `replacen` with + /// `limit` set to `0`. + /// + /// The documentation for [`Regex::replace`] goes into more detail about + /// what kinds of replacement strings are supported. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// # Fallibility + /// + /// If you need to write a replacement routine where any individual + /// replacement might "fail," doing so with this API isn't really feasible + /// because there's no way to stop the search process if a replacement + /// fails. 
Instead, if you need this functionality, you should consider + /// implementing your own replacement routine: + /// + /// ``` + /// use regex::bytes::{Captures, Regex}; + /// + /// fn replace_all<E>( + /// re: &Regex, + /// haystack: &[u8], + /// replacement: impl Fn(&Captures) -> Result<Vec<u8>, E>, + /// ) -> Result<Vec<u8>, E> { + /// let mut new = Vec::with_capacity(haystack.len()); + /// let mut last_match = 0; + /// for caps in re.captures_iter(haystack) { + /// let m = caps.get(0).unwrap(); + /// new.extend_from_slice(&haystack[last_match..m.start()]); + /// new.extend_from_slice(&replacement(&caps)?); + /// last_match = m.end(); + /// } + /// new.extend_from_slice(&haystack[last_match..]); + /// Ok(new) + /// } + /// + /// // Let's replace each word with the number of bytes in that word. + /// // But if we see a word that is "too long," we'll give up. + /// let re = Regex::new(r"\w+").unwrap(); + /// let replacement = |caps: &Captures| -> Result<Vec<u8>, &'static str> { + /// if caps[0].len() >= 5 { + /// return Err("word too long"); + /// } + /// Ok(caps[0].len().to_string().into_bytes()) + /// }; + /// assert_eq!( + /// Ok(b"2 3 3 3?".to_vec()), + /// replace_all(&re, b"hi how are you?", &replacement), + /// ); + /// assert!(replace_all(&re, b"hi there", &replacement).is_err()); + /// ``` + /// + /// # Example + /// + /// This example shows how to flip the order of whitespace (excluding line + /// terminators) delimited fields, and normalizes the whitespace that + /// delimits the fields: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap(); + /// let hay = b" + /// Greetings 1973 + /// Wild\t1973 + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "; + /// let new = re.replace_all(hay, b"$2 $1"); + /// assert_eq!(new, &b" + /// 1973 Greetings + /// 1973 Wild + /// 1975 BornToRun + /// 1978 Darkness + /// 1980 TheRiver + /// "[..]); + /// ``` + #[inline] + pub fn 
replace_all<'h, R: Replacer>( + &self, + haystack: &'h [u8], + rep: R, + ) -> Cow<'h, [u8]> { + self.replacen(haystack, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in the haystack with + /// the replacement provided. If `limit` is `0`, then all non-overlapping + /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is + /// equivalent to `Regex::replacen(hay, 0, rep)`. + /// + /// The documentation for [`Regex::replace`] goes into more detail about + /// what kinds of replacement strings are supported. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// Although note that the worst case time here has an upper bound given + /// by the `limit` parameter. + /// + /// # Fallibility + /// + /// See the corresponding section in the docs for [`Regex::replace_all`] + /// for tips on how to deal with a replacement routine that can fail. + /// + /// # Example + /// + /// This example shows how to flip the order of whitespace (excluding line + /// terminators) delimited fields, and normalizes the whitespace that + /// delimits the fields. But we only do it for the first two matches. 
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap(); + /// let hay = b" + /// Greetings 1973 + /// Wild\t1973 + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "; + /// let new = re.replacen(hay, 2, b"$2 $1"); + /// assert_eq!(new, &b" + /// 1973 Greetings + /// 1973 Wild + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "[..]); + /// ``` + #[inline] + pub fn replacen<'h, R: Replacer>( + &self, + haystack: &'h [u8], + limit: usize, + mut rep: R, + ) -> Cow<'h, [u8]> { + // If we know that the replacement doesn't have any capture expansions, + // then we can use the fast path. The fast path can make a tremendous + // difference: + // + // 1) We use `find_iter` instead of `captures_iter`. Not asking for + // captures generally makes the regex engines faster. + // 2) We don't need to look up all of the capture groups and do + // replacements inside the replacement string. We just push it + // at each match and be done with it. + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(haystack).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(haystack); + } + let mut new = Vec::with_capacity(haystack.len()); + let mut last_match = 0; + for (i, m) in it { + new.extend_from_slice(&haystack[last_match..m.start()]); + new.extend_from_slice(&rep); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.extend_from_slice(&haystack[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement needs access to + // capture groups. 
+ let mut it = self.captures_iter(haystack).enumerate().peekable();
+ if it.peek().is_none() {
+ return Cow::Borrowed(haystack);
+ }
+ let mut new = Vec::with_capacity(haystack.len());
+ let mut last_match = 0;
+ for (i, cap) in it {
+ // unwrap on 0 is OK because captures only reports matches
+ let m = cap.get(0).unwrap();
+ new.extend_from_slice(&haystack[last_match..m.start()]);
+ rep.replace_append(&cap, &mut new);
+ last_match = m.end();
+ if limit > 0 && i >= limit - 1 {
+ break;
+ }
+ }
+ new.extend_from_slice(&haystack[last_match..]);
+ Cow::Owned(new)
+ }
+}
+
+/// A group of advanced or "lower level" search methods. Some methods permit
+/// starting the search at a position greater than `0` in the haystack. Other
+/// methods permit reusing allocations, for example, when extracting the
+/// matches for capture groups.
+impl Regex {
+ /// Returns the end byte offset of the first match in the haystack given.
+ ///
+ /// This method may have the same performance characteristics as
+ /// `is_match`. Behaviorally, it doesn't just report whether a match
+ /// occurs, but also the end offset for a match. In particular, the offset
+ /// returned *may be shorter* than the proper end of the leftmost-first
+ /// match that you would find via [`Regex::find`].
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change based on internal heuristics.
+ ///
+ /// # Example
+ ///
+ /// Typically, `a+` would match the entire first sequence of `a` in some
+ /// haystack, but `shortest_match` *may* give up as soon as it sees the
+ /// first `a`.
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"a+").unwrap(); + /// let offset = re.shortest_match(b"aaaaa").unwrap(); + /// assert_eq!(offset, 1); + /// ``` + #[inline] + pub fn shortest_match(&self, haystack: &[u8]) -> Option<usize> { + self.shortest_match_at(haystack, 0) + } + + /// Returns the same as `shortest_match`, but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only match + /// when `start == 0`. + /// + /// If a match is found, the offset returned is relative to the beginning + /// of the haystack, not the beginning of the search. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = b"eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(re.shortest_match(&hay[2..]), Some(4)); + /// // No match because the assertions take the context into account. + /// assert_eq!(re.shortest_match_at(hay, 2), None); + /// ``` + #[inline] + pub fn shortest_match_at( + &self, + haystack: &[u8], + start: usize, + ) -> Option<usize> { + let input = + Input::new(haystack).earliest(true).span(start..haystack.len()); + self.meta.search_half(&input).map(|hm| hm.offset()) + } + + /// Returns the same as [`Regex::is_match`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. 
+ /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = b"eschew"; + /// // We get a match here, but it's probably not intended. + /// assert!(re.is_match(&hay[2..])); + /// // No match because the assertions take the context into account. + /// assert!(!re.is_match_at(hay, 2)); + /// ``` + #[inline] + pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool { + self.meta.is_match(Input::new(haystack).span(start..haystack.len())) + } + + /// Returns the same as [`Regex::find`], but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = b"eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4)); + /// // No match because the assertions take the context into account. 
+ /// assert_eq!(re.find_at(hay, 2), None); + /// ``` + #[inline] + pub fn find_at<'h>( + &self, + haystack: &'h [u8], + start: usize, + ) -> Option<Match<'h>> { + let input = Input::new(haystack).span(start..haystack.len()); + self.meta.find(input).map(|m| Match::new(haystack, m.start(), m.end())) + } + + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = b"eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], b"chew"); + /// // No match because the assertions take the context into account. + /// assert!(re.captures_at(hay, 2).is_none()); + /// ``` + #[inline] + pub fn captures_at<'h>( + &self, + haystack: &'h [u8], + start: usize, + ) -> Option<Captures<'h>> { + let input = Input::new(haystack).span(start..haystack.len()); + let mut caps = self.meta.create_captures(); + self.meta.captures(input, &mut caps); + if caps.is_match() { + let static_captures_len = self.static_captures_len(); + Some(Captures { haystack, caps, static_captures_len }) + } else { + None + } + } + + /// This is like [`Regex::captures`], but writes the byte offsets of each + /// capture group match into the locations given. + /// + /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`], + /// but does *not* store a reference to the haystack. 
This makes its API + /// a bit lower level and less convenience. But in exchange, callers + /// may allocate their own `CaptureLocations` and reuse it for multiple + /// searches. This may be helpful if allocating a `Captures` shows up in a + /// profile as too costly. + /// + /// To create a `CaptureLocations` value, use the + /// [`Regex::capture_locations`] method. + /// + /// This also the overall match if one was found. When a match is found, + /// its offsets are also always stored in `locs` at index `0`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert!(re.captures_read(&mut locs, b"id=foo123").is_some()); + /// assert_eq!(Some((0, 9)), locs.get(0)); + /// assert_eq!(Some((0, 2)), locs.get(1)); + /// assert_eq!(Some((3, 9)), locs.get(2)); + /// ``` + #[inline] + pub fn captures_read<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h [u8], + ) -> Option<Match<'h>> { + self.captures_read_at(locs, haystack, 0) + } + + /// Returns the same as [`Regex::captures_read`], but starts the search at + /// the given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = b"eschew"; + /// let mut locs = re.capture_locations(); + /// // We get a match here, but it's probably not intended. 
+ /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some()); + /// // No match because the assertions take the context into account. + /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none()); + /// ``` + #[inline] + pub fn captures_read_at<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h [u8], + start: usize, + ) -> Option<Match<'h>> { + let input = Input::new(haystack).span(start..haystack.len()); + self.meta.search_captures(&input, &mut locs.0); + locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end())) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + #[inline] + pub fn read_captures_at<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h [u8], + start: usize, + ) -> Option<Match<'h>> { + self.captures_read_at(locs, haystack, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"foo\w+bar").unwrap(); + /// assert_eq!(re.as_str(), r"foo\w+bar"); + /// ``` + #[inline] + pub fn as_str(&self) -> &str { + &self.pattern + } + + /// Returns an iterator over the capture names in this regex. + /// + /// The iterator returned yields elements of type `Option<&str>`. That is, + /// the iterator yields values for all capture groups, even ones that are + /// unnamed. The order of the groups corresponds to the order of the group's + /// corresponding opening parenthesis. + /// + /// The first element of the iterator always yields the group corresponding + /// to the overall match, and this group is always unnamed. Therefore, the + /// iterator always yields at least one group. 
+ /// + /// # Example + /// + /// This shows basic usage with a mix of named and unnamed capture groups: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); + /// let mut names = re.capture_names(); + /// assert_eq!(names.next(), Some(None)); + /// assert_eq!(names.next(), Some(Some("a"))); + /// assert_eq!(names.next(), Some(Some("b"))); + /// assert_eq!(names.next(), Some(None)); + /// // the '(?:.)' group is non-capturing and so doesn't appear here! + /// assert_eq!(names.next(), Some(Some("c"))); + /// assert_eq!(names.next(), None); + /// ``` + /// + /// The iterator always yields at least one element, even for regexes with + /// no capture groups and even for regexes that can never match: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let mut names = re.capture_names(); + /// assert_eq!(names.next(), Some(None)); + /// assert_eq!(names.next(), None); + /// + /// let re = Regex::new(r"[a&&b]").unwrap(); + /// let mut names = re.capture_names(); + /// assert_eq!(names.next(), Some(None)); + /// assert_eq!(names.next(), None); + /// ``` + #[inline] + pub fn capture_names(&self) -> CaptureNames<'_> { + CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO)) + } + + /// Returns the number of captures groups in this regex. + /// + /// This includes all named and unnamed groups, including the implicit + /// unnamed group that is always present and corresponds to the entire + /// match. + /// + /// Since the implict unnamed group is always included in this length, the + /// length returned is guaranteed to be greater than zero. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"foo").unwrap(); + /// assert_eq!(1, re.captures_len()); + /// + /// let re = Regex::new(r"(foo)").unwrap(); + /// assert_eq!(2, re.captures_len()); + /// + /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); + /// assert_eq!(5, re.captures_len()); + /// + /// let re = Regex::new(r"[a&&b]").unwrap(); + /// assert_eq!(1, re.captures_len()); + /// ``` + #[inline] + pub fn captures_len(&self) -> usize { + self.meta.group_info().group_len(PatternID::ZERO) + } + + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.meta.static_captures_len() + } + + /// Returns a fresh allocated set of capture locations that can + /// be reused in multiple calls to [`Regex::captures_read`] or + /// [`Regex::captures_read_at`]. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(.)(.)(\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert!(re.captures_read(&mut locs, b"Padron").is_some()); + /// assert_eq!(locs.get(0), Some((0, 6))); + /// assert_eq!(locs.get(1), Some((0, 1))); + /// assert_eq!(locs.get(2), Some((1, 2))); + /// assert_eq!(locs.get(3), Some((2, 6))); + /// ``` + #[inline] + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.meta.create_captures()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate uses this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + #[inline] + pub fn locations(&self) -> CaptureLocations { + self.capture_locations() + } +} + +/// Represents a single match of a regex in a haystack. +/// +/// A `Match` contains both the start and end byte offsets of the match and the +/// actual substring corresponding to the range of those byte offsets. It is +/// guaranteed that `start <= end`. 
When `start == end`, the match is empty. +/// +/// Unlike the top-level `Match` type, this `Match` type is produced by APIs +/// that search `&[u8]` haystacks. This means that the offsets in a `Match` can +/// point to anywhere in the haystack, including in a place that splits the +/// UTF-8 encoding of a Unicode scalar value. +/// +/// The lifetime parameter `'h` refers to the lifetime of the matched of the +/// haystack that this match was produced from. +/// +/// # Numbering +/// +/// The byte offsets in a `Match` form a half-open interval. That is, the +/// start of the range is inclusive and the end of the range is exclusive. +/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte +/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and +/// `6` corresponds to `x`, which is one past the end of the match. This +/// corresponds to the same kind of slicing that Rust uses. +/// +/// For more on why this was chosen over other schemes (aside from being +/// consistent with how Rust the language works), see [this discussion] and +/// [Dijkstra's note on a related topic][note]. +/// +/// [this discussion]: https://github.com/rust-lang/regex/discussions/866 +/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html +/// +/// # Example +/// +/// This example shows the value of each of the methods on `Match` for a +/// particular search. 
+/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"\p{Greek}+").unwrap(); +/// let hay = "Greek: αβγδ".as_bytes(); +/// let m = re.find(hay).unwrap(); +/// assert_eq!(7, m.start()); +/// assert_eq!(15, m.end()); +/// assert!(!m.is_empty()); +/// assert_eq!(8, m.len()); +/// assert_eq!(7..15, m.range()); +/// assert_eq!("αβγδ".as_bytes(), m.as_bytes()); +/// ``` +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct Match<'h> { + haystack: &'h [u8], + start: usize, + end: usize, +} + +impl<'h> Match<'h> { + /// Returns the byte offset of the start of the match in the haystack. The + /// start of the match corresponds to the position where the match begins + /// and includes the first byte in the match. + /// + /// It is guaranteed that `Match::start() <= Match::end()`. + /// + /// Unlike the top-level `Match` type, the start offset may appear anywhere + /// in the haystack. This includes between the code units of a UTF-8 + /// encoded Unicode scalar value. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the byte offset of the end of the match in the haystack. The + /// end of the match corresponds to the byte immediately following the last + /// byte in the match. This means that `&slice[start..end]` works as one + /// would expect. + /// + /// It is guaranteed that `Match::start() <= Match::end()`. + /// + /// Unlike the top-level `Match` type, the start offset may appear anywhere + /// in the haystack. This includes between the code units of a UTF-8 + /// encoded Unicode scalar value. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns true if and only if this match has a length of zero. + /// + /// Note that an empty match can only occur when the regex itself can + /// match the empty string. Here are some examples of regexes that can + /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`, + /// `(foo|\d+|quux)?`. 
+ #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.start..self.end + } + + /// Returns the substring of the haystack that matched. + #[inline] + pub fn as_bytes(&self) -> &'h [u8] { + &self.haystack[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'h [u8], start: usize, end: usize) -> Match<'h> { + Match { haystack, start, end } + } +} + +impl<'h> core::fmt::Debug for Match<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut fmt = f.debug_struct("Match"); + fmt.field("start", &self.start).field("end", &self.end); + if let Ok(s) = core::str::from_utf8(self.as_bytes()) { + fmt.field("bytes", &s); + } else { + // FIXME: It would be nice if this could be printed as a string + // with invalid UTF-8 replaced with hex escapes. A alloc would + // probably okay if that makes it easier, but regex-automata does + // (at time of writing) have internal routines that do this. So + // maybe we should expose them. + fmt.field("bytes", &self.as_bytes()); + } + fmt.finish() + } +} + +impl<'h> From<Match<'h>> for &'h [u8] { + fn from(m: Match<'h>) -> &'h [u8] { + m.as_bytes() + } +} + +impl<'h> From<Match<'h>> for core::ops::Range<usize> { + fn from(m: Match<'h>) -> core::ops::Range<usize> { + m.range() + } +} + +/// Represents the capture groups for a single match. +/// +/// Capture groups refer to parts of a regex enclosed in parentheses. They can +/// be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. 
For +/// example, say you want to match the individual letters in a 5-letter word: +/// +/// ```text +/// (?<first>\w)(\w)(?:\w)\w(?<last>\w) +/// ``` +/// +/// This regex has 4 capture groups: +/// +/// * The group at index `0` corresponds to the overall match. It is always +/// present in every match and never has a name. +/// * The group at index `1` with name `first` corresponding to the first +/// letter. +/// * The group at index `2` with no name corresponding to the second letter. +/// * The group at index `3` with name `last` corresponding to the fifth and +/// last letter. +/// +/// Notice that `(?:\w)` was not listed above as a capture group despite it +/// being enclosed in parentheses. That's because `(?:pattern)` is a special +/// syntax that permits grouping but *without* capturing. The reason for not +/// treating it as a capture is that tracking and reporting capture groups +/// requires additional state that may lead to slower searches. So using as few +/// capture groups as possible can help performance. (Although the difference +/// in performance of a couple of capture groups is likely immaterial.) +/// +/// Values with this type are created by [`Regex::captures`] or +/// [`Regex::captures_iter`]. +/// +/// `'h` is the lifetime of the haystack that these captures were matched from. +/// +/// # Example +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap(); +/// let caps = re.captures(b"toady").unwrap(); +/// assert_eq!(b"toady", &caps[0]); +/// assert_eq!(b"t", &caps["first"]); +/// assert_eq!(b"o", &caps[2]); +/// assert_eq!(b"y", &caps["last"]); +/// ``` +pub struct Captures<'h> { + haystack: &'h [u8], + caps: captures::Captures, + static_captures_len: Option<usize>, +} + +impl<'h> Captures<'h> { + /// Returns the `Match` associated with the capture group at index `i`. 
If + /// `i` does not correspond to a capture group, or if the capture group did + /// not participate in the match, then `None` is returned. + /// + /// When `i == 0`, this is guaranteed to return a non-`None` value. + /// + /// # Examples + /// + /// Get the substring that matched with a default of an empty string if the + /// group didn't participate in the match: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures(b"abc123").unwrap(); + /// + /// let substr1 = caps.get(1).map_or(&b""[..], |m| m.as_bytes()); + /// let substr2 = caps.get(2).map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(substr1, b"123"); + /// assert_eq!(substr2, b""); + /// ``` + #[inline] + pub fn get(&self, i: usize) -> Option<Match<'h>> { + self.caps + .get_group(i) + .map(|sp| Match::new(self.haystack, sp.start, sp.end)) + } + + /// Returns the `Match` associated with the capture group named `name`. If + /// `name` isn't a valid capture group or it refers to a group that didn't + /// match, then `None` is returned. + /// + /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime + /// matches the lifetime of the haystack in this `Captures` value. + /// Conversely, the substring returned by `caps["name"]` has a lifetime + /// of the `Captures` value, which is likely shorter than the lifetime of + /// the haystack. In some cases, it may be necessary to use this method to + /// access the matching substring instead of the `caps["name"]` notation. 
+ /// + /// # Examples + /// + /// Get the substring that matched with a default of an empty string if the + /// group didn't participate in the match: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new( + /// r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))", + /// ).unwrap(); + /// let caps = re.captures(b"abc123").unwrap(); + /// + /// let numbers = caps.name("numbers").map_or(&b""[..], |m| m.as_bytes()); + /// let letters = caps.name("letters").map_or(&b""[..], |m| m.as_bytes()); + /// assert_eq!(numbers, b"123"); + /// assert_eq!(letters, b""); + /// ``` + #[inline] + pub fn name(&self, name: &str) -> Option<Match<'h>> { + self.caps + .get_group_by_name(name) + .map(|sp| Match::new(self.haystack, sp.start, sp.end)) + } + + /// This is a convenience routine for extracting the substrings + /// corresponding to matching capture groups. + /// + /// This returns a tuple where the first element corresponds to the full + /// substring of the haystack that matched the regex. The second element is + /// an array of substrings, with each corresponding to the to the substring + /// that matched for a particular capture group. + /// + /// # Panics + /// + /// This panics if the number of possible matching groups in this + /// `Captures` value is not fixed to `N` in all circumstances. + /// More precisely, this routine only works when `N` is equivalent to + /// [`Regex::static_captures_len`]. + /// + /// Stated more plainly, if the number of matching capture groups in a + /// regex can vary from match to match, then this function always panics. + /// + /// For example, `(a)(b)|(c)` could produce two matching capture groups + /// or one matching capture group for any given match. Therefore, one + /// cannot use `extract` with such a pattern. 
+ /// + /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because + /// the number of capture groups in every match is always equivalent, + /// even if the capture _indices_ in each match are not. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); + /// let hay = b"On 2010-03-14, I became a Tenneessee lamb."; + /// let Some((full, [year, month, day])) = + /// re.captures(hay).map(|caps| caps.extract()) else { return }; + /// assert_eq!(b"2010-03-14", full); + /// assert_eq!(b"2010", year); + /// assert_eq!(b"03", month); + /// assert_eq!(b"14", day); + /// ``` + /// + /// # Example: iteration + /// + /// This example shows how to use this method when iterating over all + /// `Captures` matches in a haystack. + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); + /// let hay = b"1973-01-05, 1975-08-25 and 1980-10-18"; + /// + /// let mut dates: Vec<(&[u8], &[u8], &[u8])> = vec![]; + /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) { + /// dates.push((y, m, d)); + /// } + /// assert_eq!(dates, vec![ + /// (&b"1973"[..], &b"01"[..], &b"05"[..]), + /// (&b"1975"[..], &b"08"[..], &b"25"[..]), + /// (&b"1980"[..], &b"10"[..], &b"18"[..]), + /// ]); + /// ``` + /// + /// # Example: parsing different formats + /// + /// This API is particularly useful when you need to extract a particular + /// value that might occur in a different format. 
Consider, for example, + /// an identifier that might be in double quotes or single quotes: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap(); + /// let hay = br#"The first is id:"foo" and the second is id:'bar'."#; + /// let mut ids = vec![]; + /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) { + /// ids.push(id); + /// } + /// assert_eq!(ids, vec![b"foo", b"bar"]); + /// ``` + pub fn extract<const N: usize>(&self) -> (&'h [u8], [&'h [u8]; N]) { + let len = self + .static_captures_len + .expect("number of capture groups can vary in a match") + .checked_sub(1) + .expect("number of groups is always greater than zero"); + assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len); + // The regex-automata variant of extract is a bit more permissive. + // It doesn't require the number of matching capturing groups to be + // static, and you can even request fewer groups than what's there. So + // this is guaranteed to never panic because we've asserted above that + // the user has requested precisely the number of groups that must be + // present in any match for this regex. + self.caps.extract_bytes(self.haystack) + } + + /// Expands all instances of `$ref` in `replacement` to the corresponding + /// capture group, and writes them to the `dst` buffer given. A `ref` can + /// be a capture group index or a name. If `ref` doesn't refer to a capture + /// group that participated in the match, then it is replaced with the + /// empty string. + /// + /// # Format + /// + /// The format of the replacement string supports two different kinds of + /// capture references: unbraced and braced. + /// + /// For the unbraced format, the format supported is `$ref` where `name` + /// can be any character in the class `[0-9A-Za-z_]`. `ref` is always + /// the longest possible parse. 
So for example, `$1a` corresponds to the + /// capture group named `1a` and not the capture group at index `1`. If + /// `ref` matches `^[0-9]+$`, then it is treated as a capture group index + /// itself and not a name. + /// + /// For the braced format, the format supported is `${ref}` where `ref` can + /// be any sequence of bytes except for `}`. If no closing brace occurs, + /// then it is not considered a capture reference. As with the unbraced + /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture + /// group index and not a name. + /// + /// The braced format is useful for exerting precise control over the name + /// of the capture reference. For example, `${1}a` corresponds to the + /// capture group reference `1` followed by the letter `a`, where as `$1a` + /// (as mentioned above) corresponds to the capture group reference `1a`. + /// The braced format is also useful for expressing capture group names + /// that use characters not supported by the unbraced format. For example, + /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`. + /// + /// If a capture group reference is found and it does not refer to a valid + /// capture group, then it will be replaced with the empty string. + /// + /// To write a literal `$`, use `$$`. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new( + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// ).unwrap(); + /// let hay = b"On 14-03-2010, I became a Tenneessee lamb."; + /// let caps = re.captures(hay).unwrap(); + /// + /// let mut dst = vec![]; + /// caps.expand(b"year=$year, month=$month, day=$day", &mut dst); + /// assert_eq!(dst, b"year=2010, month=03, day=14"); + /// ``` + #[inline] + pub fn expand(&self, replacement: &[u8], dst: &mut Vec<u8>) { + self.caps.interpolate_bytes_into(self.haystack, replacement, dst); + } + + /// Returns an iterator over all capture groups. 
This includes both + /// matching and non-matching groups. + /// + /// The iterator always yields at least one matching group: the first group + /// (at index `0`) with no name. Subsequent groups are returned in the order + /// of their opening parenthesis in the regex. + /// + /// The elements yielded have type `Option<Match<'h>>`, where a non-`None` + /// value is present if the capture group matches. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap(); + /// let caps = re.captures(b"AZ").unwrap(); + /// + /// let mut it = caps.iter(); + /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"AZ"[..])); + /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"A"[..])); + /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), None); + /// assert_eq!(it.next().unwrap().map(|m| m.as_bytes()), Some(&b"Z"[..])); + /// assert_eq!(it.next(), None); + /// ``` + #[inline] + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> { + SubCaptureMatches { haystack: self.haystack, it: self.caps.iter() } + } + + /// Returns the total number of capture groups. This includes both + /// matching and non-matching groups. + /// + /// The length returned is always equivalent to the number of elements + /// yielded by [`Captures::iter`]. Consequently, the length is always + /// greater than zero since every `Captures` value always includes the + /// match for the entire regex. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap(); + /// let caps = re.captures(b"AZ").unwrap(); + /// assert_eq!(caps.len(), 4); + /// ``` + #[inline] + pub fn len(&self) -> usize { + self.caps.group_len() + } +} + +impl<'h> core::fmt::Debug for Captures<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + /// A little helper type to provide a nice map-like debug + /// representation for our capturing group spans. + /// + /// regex-automata has something similar, but it includes the pattern + /// ID in its debug output, which is confusing. It also doesn't include + /// that strings that match because a regex-automata `Captures` doesn't + /// borrow the haystack. + struct CapturesDebugMap<'a> { + caps: &'a Captures<'a>, + } + + impl<'a> core::fmt::Debug for CapturesDebugMap<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + let mut map = f.debug_map(); + let names = + self.caps.caps.group_info().pattern_names(PatternID::ZERO); + for (group_index, maybe_name) in names.enumerate() { + let key = Key(group_index, maybe_name); + match self.caps.get(group_index) { + None => map.entry(&key, &None::<()>), + Some(mat) => map.entry(&key, &Value(mat)), + }; + } + map.finish() + } + } + + struct Key<'a>(usize, Option<&'a str>); + + impl<'a> core::fmt::Debug for Key<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + write!(f, "{}", self.0)?; + if let Some(name) = self.1 { + write!(f, "/{:?}", name)?; + } + Ok(()) + } + } + + struct Value<'a>(Match<'a>); + + impl<'a> core::fmt::Debug for Value<'a> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + use regex_automata::util::escape::DebugHaystack; + + write!( + f, + "{}..{}/{:?}", + self.0.start(), + self.0.end(), + DebugHaystack(self.0.as_bytes()) + ) + } + } + + f.debug_tuple("Captures") + .field(&CapturesDebugMap { caps: self }) + .finish() + } +} + 
+/// Get a matching capture group's haystack substring by index. +/// +/// The haystack substring returned can't outlive the `Captures` object if this +/// method is used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it). To work around this limitation, do that, use +/// [`Captures::get`] instead. +/// +/// `'h` is the lifetime of the matched haystack, but the lifetime of the +/// `&str` returned by this implementation is the lifetime of the `Captures` +/// value itself. +/// +/// # Panics +/// +/// If there is no matching group at the given index. +impl<'h> core::ops::Index<usize> for Captures<'h> { + type Output = [u8]; + + // The lifetime is written out to make it clear that the &str returned + // does NOT have a lifetime equivalent to 'h. + fn index<'a>(&'a self, i: usize) -> &'a [u8] { + self.get(i) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group at index '{}'", i)) + } +} + +/// Get a matching capture group's haystack substring by name. +/// +/// The haystack substring returned can't outlive the `Captures` object if this +/// method is used, because of how `Index` is defined (normally `a[i]` is part +/// of `a` and can't outlive it). To work around this limitation, do that, use +/// [`Captures::get`] instead. +/// +/// `'h` is the lifetime of the matched haystack, but the lifetime of the +/// `&str` returned by this implementation is the lifetime of the `Captures` +/// value itself. +/// +/// `'n` is the lifetime of the group name used to index the `Captures` value. +/// +/// # Panics +/// +/// If there is no matching group at the given name. +impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> { + type Output = [u8]; + + fn index<'a>(&'a self, name: &'n str) -> &'a [u8] { + self.name(name) + .map(|m| m.as_bytes()) + .unwrap_or_else(|| panic!("no group named '{}'", name)) + } +} + +/// A low level representation of the byte offsets of each capture group. 
+/// +/// You can think of this as a lower level [`Captures`], where this type does +/// not support named capturing groups directly and it does not borrow the +/// haystack that these offsets were matched on. +/// +/// Primarily, this type is useful when using the lower level `Regex` APIs such +/// as [`Regex::captures_read`], which permits amortizing the allocation in +/// which capture match offsets are stored. +/// +/// In order to build a value of this type, you'll need to call the +/// [`Regex::capture_locations`] method. The value returned can then be reused +/// in subsequent searches for that regex. Using it for other regexes may +/// result in a panic or otherwise incorrect results. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::bytes::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` +#[derive(Clone, Debug)] +pub struct CaptureLocations(captures::Captures); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end byte offsets of the capture group at index + /// `i`. 
This returns `None` if `i` is not a valid capture group or if the + /// capture group did not match. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); + /// assert_eq!(Some((0, 17)), locs.get(0)); + /// assert_eq!(Some((0, 5)), locs.get(1)); + /// assert_eq!(Some((6, 17)), locs.get(2)); + /// ``` + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.get_group(i).map(|sp| (sp.start, sp.end)) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// That is, the length returned is unaffected by the result of a search. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert_eq!(3, locs.len()); + /// re.captures_read(&mut locs, b"Bruce Springsteen").unwrap(); + /// assert_eq!(3, locs.len()); + /// ``` + /// + /// Notice that the length is always at least `1`, regardless of the regex: + /// + /// ``` + /// use regex::bytes::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let locs = re.capture_locations(); + /// assert_eq!(1, locs.len()); + /// + /// // [a&&b] is a regex that never matches anything. + /// let re = Regex::new(r"[a&&b]").unwrap(); + /// let locs = re.capture_locations(); + /// assert_eq!(1, locs.len()); + /// ``` + #[inline] + pub fn len(&self) -> usize { + // self.0.group_len() returns 0 if the underlying captures doesn't + // represent a match, but the behavior guaranteed for this method is + // that the length doesn't change based on a match or not. 
+ self.0.group_info().group_len(PatternID::ZERO) + } + + /// An alias for the `get` method for backwards compatibility. + /// + /// Previously, we exported `get` as `pos` in an undocumented API. To + /// prevent breaking that code (e.g., in `regex-capi`), we continue + /// re-exporting the same undocumented API. + #[doc(hidden)] + #[inline] + pub fn pos(&self, i: usize) -> Option<(usize, usize)> { + self.get(i) + } +} + +/// An iterator over all non-overlapping matches in a haystack. +/// +/// This iterator yields [`Match`] values. The iterator stops when no more +/// matches can be found. +/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the haystack. +/// +/// This iterator is created by [`Regex::find_iter`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. +#[derive(Debug)] +pub struct Matches<'r, 'h> { + haystack: &'h [u8], + it: meta::FindMatches<'r, 'h>, +} + +impl<'r, 'h> Iterator for Matches<'r, 'h> { + type Item = Match<'h>; + + #[inline] + fn next(&mut self) -> Option<Match<'h>> { + self.it + .next() + .map(|sp| Match::new(self.haystack, sp.start(), sp.end())) + } + + #[inline] + fn count(self) -> usize { + // This can actually be up to 2x faster than calling `next()` until + // completion, because counting matches when using a DFA only requires + // finding the end of each match. But returning a `Match` via `next()` + // requires the start of each match which, with a DFA, requires a + // reverse forward scan to find it. + self.it.count() + } +} + +impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {} + +/// An iterator over all non-overlapping capture matches in a haystack. +/// +/// This iterator yields [`Captures`] values. The iterator stops when no more +/// matches can be found. 
+/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the matched string. +/// +/// This iterator is created by [`Regex::captures_iter`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. +#[derive(Debug)] +pub struct CaptureMatches<'r, 'h> { + haystack: &'h [u8], + it: meta::CapturesMatches<'r, 'h>, +} + +impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> { + type Item = Captures<'h>; + + #[inline] + fn next(&mut self) -> Option<Captures<'h>> { + let static_captures_len = self.it.regex().static_captures_len(); + self.it.next().map(|caps| Captures { + haystack: self.haystack, + caps, + static_captures_len, + }) + } + + #[inline] + fn count(self) -> usize { + // This can actually be up to 2x faster than calling `next()` until + // completion, because counting matches when using a DFA only requires + // finding the end of each match. But returning a `Match` via `next()` + // requires the start of each match which, with a DFA, requires a + // reverse forward scan to find it. + self.it.count() + } +} + +impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {} + +/// An iterator over all substrings delimited by a regex match. +/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the byte string being split. +/// +/// This iterator is created by [`Regex::split`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. 
+#[derive(Debug)] +pub struct Split<'r, 'h> { + haystack: &'h [u8], + it: meta::Split<'r, 'h>, +} + +impl<'r, 'h> Iterator for Split<'r, 'h> { + type Item = &'h [u8]; + + #[inline] + fn next(&mut self) -> Option<&'h [u8]> { + self.it.next().map(|span| &self.haystack[span]) + } +} + +impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {} + +/// An iterator over at most `N` substrings delimited by a regex match. +/// +/// The last substring yielded by this iterator will be whatever remains after +/// `N-1` splits. +/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the byte string being split. +/// +/// This iterator is created by [`Regex::splitn`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. +/// +/// Although note that the worst case time here has an upper bound given +/// by the `limit` parameter to [`Regex::splitn`]. +#[derive(Debug)] +pub struct SplitN<'r, 'h> { + haystack: &'h [u8], + it: meta::SplitN<'r, 'h>, +} + +impl<'r, 'h> Iterator for SplitN<'r, 'h> { + type Item = &'h [u8]; + + #[inline] + fn next(&mut self) -> Option<&'h [u8]> { + self.it.next().map(|span| &self.haystack[span]) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } +} + +impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} + +/// An iterator over the names of all capture groups in a regex. +/// +/// This iterator yields values of type `Option<&str>` in order of the opening +/// capture group parenthesis in the regex pattern. `None` is yielded for +/// groups with no name. The first element always corresponds to the implicit +/// and unnamed group for the overall match. +/// +/// `'r` is the lifetime of the compiled regular expression. 
+/// +/// This iterator is created by [`Regex::capture_names`]. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + #[inline] + fn next(&mut self) -> Option<Option<&'r str>> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> core::iter::FusedIterator for CaptureNames<'r> {} + +/// An iterator over all group matches in a [`Captures`] value. +/// +/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the +/// lifetime of the haystack that the matches are for. The order of elements +/// yielded corresponds to the order of the opening parenthesis for the group +/// in the regex pattern. `None` is yielded for groups that did not participate +/// in the match. +/// +/// The first element always corresponds to the implicit group for the overall +/// match. Since this iterator is created by a [`Captures`] value, and a +/// `Captures` value is only created when a match occurs, it follows that the +/// first element yielded by this iterator is guaranteed to be non-`None`. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that +/// created this iterator, and the lifetime `'h` corresponds to the originally +/// matched haystack. 
+#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 'h> { + haystack: &'h [u8], + it: captures::CapturesPatternIter<'c>, +} + +impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> { + type Item = Option<Match<'h>>; + + #[inline] + fn next(&mut self) -> Option<Option<Match<'h>>> { + self.it.next().map(|group| { + group.map(|sp| Match::new(self.haystack, sp.start, sp.end)) + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {} + +impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} + +/// A trait for types that can be used to replace matches in a haystack. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&[u8]` along with other +/// variants of byte string types, as well as `FnMut(&Captures) -> Vec<u8>` (or +/// any `FnMut(&Captures) -> T` where `T: AsRef<[u8]>`). Those cover most use +/// cases, but callers can implement this trait directly if necessary. +/// +/// # Example +/// +/// This example shows a basic implementation of the `Replacer` trait. This can +/// be done much more simply using the replacement byte string interpolation +/// support (e.g., `$first $last`), but this approach avoids needing to parse +/// the replacement byte string at all. 
+/// +/// ``` +/// use regex::bytes::{Captures, Regex, Replacer}; +/// +/// struct NameSwapper; +/// +/// impl Replacer for NameSwapper { +/// fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { +/// dst.extend_from_slice(&caps["first"]); +/// dst.extend_from_slice(b" "); +/// dst.extend_from_slice(&caps["last"]); +/// } +/// } +/// +/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); +/// let result = re.replace(b"Springsteen, Bruce", NameSwapper); +/// assert_eq!(result, &b"Bruce Springsteen"[..]); +/// ``` +pub trait Replacer { + /// Appends possibly empty data to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to have + /// a match at capture group `0`. + /// + /// For example, a no-op replacement would be + /// `dst.extend_from_slice(&caps[0])`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>); + + /// Return a fixed unchanging replacement byte string. + /// + /// When doing replacements, if access to [`Captures`] is not needed (e.g., + /// the replacement byte string does not need `$` expansion), then it can + /// be beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to a replacement routine + /// such as [`Regex::replace_all`]. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + None + } + + /// Returns a type that implements `Replacer`, but that borrows and wraps + /// this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &[u8], + /// mut rep: R, + /// ) -> Vec<u8> { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +impl<'a, const N: usize> Replacer for &'a [u8; N] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(&**self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<const N: usize> Replacer for [u8; N] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(&*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a [u8] { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl Replacer for Vec<u8> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, [u8]> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + caps.expand(self.as_ref(), dst); + } + + fn 
no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + no_expansion(self) + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<[u8]>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice((*self)(caps).as_ref()); + } +} + +/// A by-reference adaptor for a [`Replacer`]. +/// +/// This permits reusing the same `Replacer` value in multiple calls to a +/// replacement routine like [`Regex::replace_all`]. +/// +/// This type is created by [`Replacer::by_ref`]. +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut Vec<u8>) { + self.0.replace_append(caps, dst) + } + + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, [u8]>> { + self.0.no_expansion() + } +} + +/// A helper type for forcing literal string replacement. +/// +/// It can be used with routines like [`Regex::replace`] and +/// [`Regex::replace_all`] to do a literal string replacement without expanding +/// `$name` to their corresponding capture groups. This can be both convenient +/// (to avoid escaping `$`, for example) and faster (since capture groups +/// don't need to be found). +/// +/// `'s` is the lifetime of the literal string to use. 
+/// +/// # Example +/// +/// ``` +/// use regex::bytes::{NoExpand, Regex}; +/// +/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); +/// let result = re.replace(b"Springsteen, Bruce", NoExpand(b"$2 $last")); +/// assert_eq!(result, &b"$2 $last"[..]); +/// ``` +#[derive(Clone, Debug)] +pub struct NoExpand<'s>(pub &'s [u8]); + +impl<'s> Replacer for NoExpand<'s> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut Vec<u8>) { + dst.extend_from_slice(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, [u8]>> { + Some(Cow::Borrowed(self.0)) + } +} + +/// Quickly checks the given replacement string for whether interpolation +/// should be done on it. It returns `None` if a `$` was found anywhere in the +/// given string, which suggests interpolation needs to be done. But if there's +/// no `$` anywhere, then interpolation definitely does not need to be done. In +/// that case, the given string is returned as a borrowed `Cow`. +/// +/// This is meant to be used to implement the `Replacer::no_expansion` method +/// in its various trait impls. +fn no_expansion<T: AsRef<[u8]>>(replacement: &T) -> Option<Cow<'_, [u8]>> { + let replacement = replacement.as_ref(); + match crate::find_byte::find_byte(b'$', replacement) { + Some(_) => None, + None => Some(Cow::Borrowed(replacement)), + } +} diff --git a/src/regex/mod.rs b/src/regex/mod.rs new file mode 100644 index 0000000000..93fadec8bf --- /dev/null +++ b/src/regex/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod bytes; +pub(crate) mod string; diff --git a/src/regex/string.rs b/src/regex/string.rs new file mode 100644 index 0000000000..438af7bebf --- /dev/null +++ b/src/regex/string.rs @@ -0,0 +1,2561 @@ +use alloc::{borrow::Cow, string::String, sync::Arc}; + +use regex_automata::{meta, util::captures, Input, PatternID}; + +use crate::{error::Error, RegexBuilder}; + +/// A compiled regular expression for searching Unicode haystacks. 
+/// +/// A `Regex` can be used to search haystacks, split haystacks into substrings +/// or replace substrings in a haystack with a different substring. All +/// searching is done with an implicit `(?s:.)*?` at the beginning and end of +/// a pattern. To force an expression to match the whole string (or a prefix +/// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`). +/// +/// While this crate will handle Unicode strings (whether in the regular +/// expression or in the haystack), all positions returned are **byte +/// offsets**. Every byte offset is guaranteed to be at a Unicode code point +/// boundary. That is, all offsets returned by the `Regex` API are guaranteed +/// to be ranges that can slice a `&str` without panicking. If you want to +/// relax this requirement, then you must search `&[u8]` haystacks with a +/// [`bytes::Regex`](crate::bytes::Regex). +/// +/// The only methods that allocate new strings are the string replacement +/// methods. All other methods (searching and splitting) return borrowed +/// references into the haystack given. +/// +/// # Example +/// +/// Find the offsets of a US phone number: +/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); +/// let m = re.find("phone: 111-222-3333").unwrap(); +/// assert_eq!(7..19, m.range()); +/// ``` +/// +/// # Example: extracting capture groups +/// +/// A common way to use regexes is with capture groups. That is, instead of +/// just looking for matches of an entire regex, parentheses are used to create +/// groups that represent part of the match. +/// +/// For example, consider a haystack with multiple lines, and each line has +/// three whitespace delimited fields where the second field is expected to be +/// a number and the third field a boolean. 
To make this convenient, we use +/// the [`Captures::extract`] API to put the strings that match each group +/// into a fixed size array: +/// +/// ``` +/// use regex::Regex; +/// +/// let hay = " +/// rabbit 54 true +/// groundhog 2 true +/// does not match +/// fox 109 false +/// "; +/// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap(); +/// let mut fields: Vec<(&str, i64, bool)> = vec![]; +/// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) { +/// fields.push((f1, f2.parse()?, f3.parse()?)); +/// } +/// assert_eq!(fields, vec![ +/// ("rabbit", 54, true), +/// ("groundhog", 2, true), +/// ("fox", 109, false), +/// ]); +/// +/// # Ok::<(), Box<dyn std::error::Error>>(()) +/// ``` +/// +/// # Example: searching with the `Pattern` trait +/// +/// **Note**: This section requires that this crate is compiled with the +/// `pattern` Cargo feature enabled, which **requires nightly Rust**. +/// +/// Since `Regex` implements `Pattern` from the standard library, one can +/// use regexes with methods defined on `&str`. For example, `is_match`, +/// `find`, `find_iter` and `split` can, in some cases, be replaced with +/// `str::contains`, `str::find`, `str::match_indices` and `str::split`. +/// +/// Here are some examples: +/// +/// ```ignore +/// use regex::Regex; +/// +/// let re = Regex::new(r"\d+").unwrap(); +/// let hay = "a111b222c"; +/// +/// assert!(hay.contains(&re)); +/// assert_eq!(hay.find(&re), Some(1)); +/// assert_eq!(hay.match_indices(&re).collect::<Vec<_>>(), vec![ +/// (1, "111"), +/// (5, "222"), +/// ]); +/// assert_eq!(hay.split(&re).collect::<Vec<_>>(), vec!["a", "b", "c"]); +/// ``` +#[derive(Clone)] +pub struct Regex { + pub(crate) meta: meta::Regex, + pub(crate) pattern: Arc<str>, +} + +impl core::fmt::Display for Regex { + /// Shows the original regular expression. 
+ fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "{}", self.as_str()) + } +} + +impl core::fmt::Debug for Regex { + /// Shows the original regular expression. + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + f.debug_tuple("Regex").field(&self.as_str()).finish() + } +} + +impl core::str::FromStr for Regex { + type Err = Error; + + /// Attempts to parse a string into a regular expression + fn from_str(s: &str) -> Result<Regex, Error> { + Regex::new(s) + } +} + +/// Core regular expression methods. +impl Regex { + /// Compiles a regular expression. Once compiled, it can be used repeatedly + /// to search, split or replace substrings in a haystack. + /// + /// Note that regex compilation tends to be a somewhat expensive process, + /// and unlike higher level environments, compilation is not automatically + /// cached for you. One should endeavor to compile a regex once and then + /// reuse it. For example, it's a bad idea to compile the same regex + /// repeatedly in a loop. + /// + /// # Errors + /// + /// If an invalid pattern is given, then an error is returned. + /// An error is also returned if the pattern is valid, but would + /// produce a regex that is bigger than the configured size limit via + /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by + /// default.) + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// // An Invalid pattern because of an unclosed parenthesis + /// assert!(Regex::new(r"foo(bar").is_err()); + /// // An invalid pattern because the regex would be too big + /// // because Unicode tends to inflate things. + /// assert!(Regex::new(r"\w{1000}").is_err()); + /// // Disabling Unicode can make the regex much smaller, + /// // potentially by up to or more than an order of magnitude. 
+ /// assert!(Regex::new(r"(?-u:\w){1000}").is_ok()); + /// ``` + pub fn new(re: &str) -> Result<Regex, Error> { + RegexBuilder::new(re).build() + } + + /// Returns true if and only if there is a match for the regex anywhere + /// in the haystack given. + /// + /// It is recommended to use this method if all you need to do is test + /// whether a match exists, since the underlying matching engine may be + /// able to do less work. + /// + /// # Example + /// + /// Test if some haystack contains at least one word with exactly 13 + /// Unicode word characters: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = "I categorically deny having triskaidekaphobia."; + /// assert!(re.is_match(hay)); + /// ``` + #[inline] + pub fn is_match(&self, haystack: &str) -> bool { + self.is_match_at(haystack, 0) + } + + /// This routine searches for the first match of this regex in the + /// haystack given, and if found, returns a [`Match`]. The `Match` + /// provides access to both the byte offsets of the match and the actual + /// substring that matched. + /// + /// Note that this should only be used if you want to find the entire + /// match. If instead you just want to test the existence of a match, + /// it's potentially faster to use `Regex::is_match(hay)` instead of + /// `Regex::find(hay).is_some()`. + /// + /// # Example + /// + /// Find the first word with exactly 13 Unicode word characters: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = "I categorically deny having triskaidekaphobia."; + /// let mat = re.find(hay).unwrap(); + /// assert_eq!(2..15, mat.range()); + /// assert_eq!("categorically", mat.as_str()); + /// ``` + #[inline] + pub fn find<'h>(&self, haystack: &'h str) -> Option<Match<'h>> { + self.find_at(haystack, 0) + } + + /// Returns an iterator that yields successive non-overlapping matches in + /// the given haystack. 
The iterator yields values of type [`Match`]. + /// + /// # Time complexity + /// + /// Note that since `find_iter` runs potentially many searches on the + /// haystack and since each search has worst case `O(m * n)` time + /// complexity, the overall worst case time complexity for iteration is + /// `O(m * n^2)`. + /// + /// # Example + /// + /// Find every word with exactly 13 Unicode word characters: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\b\w{13}\b").unwrap(); + /// let hay = "Retroactively relinquishing remunerations is reprehensible."; + /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_str()).collect(); + /// assert_eq!(matches, vec![ + /// "Retroactively", + /// "relinquishing", + /// "remunerations", + /// "reprehensible", + /// ]); + /// ``` + #[inline] + pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> { + Matches { haystack, it: self.meta.find_iter(haystack) } + } + + /// This routine searches for the first match of this regex in the haystack + /// given, and if found, returns not only the overall match but also the + /// matches of each capture group in the regex. If no match is found, then + /// `None` is returned. + /// + /// Capture group `0` always corresponds to an implicit unnamed group that + /// includes the entire match. If a match is found, this group is always + /// present. Subsequent groups may be named and are numbered, starting + /// at 1, by the order in which the opening parenthesis appears in the + /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`, + /// `b` and `c` correspond to capture group indices `1`, `2` and `3`, + /// respectively. + /// + /// You should only use `captures` if you need access to the capture group + /// matches. Otherwise, [`Regex::find`] is generally faster for discovering + /// just the overall match. 
+ /// + /// # Example + /// + /// Say you have some haystack with movie names and their release years, + /// like "'Citizen Kane' (1941)". It'd be nice if we could search for + /// substrings looking like that, while also extracting the movie name and + /// its release year separately. The example below shows how to do that. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(hay).unwrap(); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); + /// // You can also access the groups by index using the Index notation. + /// // Note that this will panic on an invalid index. In this case, these + /// // accesses are always correct because the overall regex will only + /// // match when these capture groups match. + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// assert_eq!(&caps[1], "Citizen Kane"); + /// assert_eq!(&caps[2], "1941"); + /// ``` + /// + /// Note that the full match is at capture group `0`. Each subsequent + /// capture group is indexed by the order of its opening `(`. + /// + /// We can make this example a bit clearer by using *named* capture groups: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap(); + /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let caps = re.captures(hay).unwrap(); + /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); + /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); + /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); + /// // You can also access the groups by name using the Index notation. + /// // Note that this will panic on an invalid group name. 
In this case, + /// // these accesses are always correct because the overall regex will + /// // only match when these capture groups match. + /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// ``` + /// + /// Here we name the capture groups, which we can access with the `name` + /// method or the `Index` notation with a `&str`. Note that the named + /// capture groups are still accessible with `get` or the `Index` notation + /// with a `usize`. + /// + /// The `0`th capture group is always unnamed, so it must always be + /// accessed with `get(0)` or `[0]`. + /// + /// Finally, one other way to get the matched substrings is with the + /// [`Captures::extract`] API: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); + /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; + /// let (full, [title, year]) = re.captures(hay).unwrap().extract(); + /// assert_eq!(full, "'Citizen Kane' (1941)"); + /// assert_eq!(title, "Citizen Kane"); + /// assert_eq!(year, "1941"); + /// ``` + #[inline] + pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> { + self.captures_at(haystack, 0) + } + + /// Returns an iterator that yields successive non-overlapping matches in + /// the given haystack. The iterator yields values of type [`Captures`]. + /// + /// This is the same as [`Regex::find_iter`], but instead of only providing + /// access to the overall match, each value yielded includes access to the + /// matches of all capture groups in the regex. Reporting this extra match + /// data is potentially costly, so callers should only use `captures_iter` + /// over `find_iter` when they actually need access to the capture group + /// matches. 
+ /// + /// # Time complexity + /// + /// Note that since `captures_iter` runs potentially many searches on the + /// haystack and since each search has worst case `O(m * n)` time + /// complexity, the overall worst case time complexity for iteration is + /// `O(m * n^2)`. + /// + /// # Example + /// + /// We can use this to find all movie titles and their release years in + /// some haystack, where the movie is formatted like "'Title' (xxxx)": + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap(); + /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// let mut movies = vec![]; + /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) { + /// movies.push((title, year.parse::<i64>()?)); + /// } + /// assert_eq!(movies, vec![ + /// ("Citizen Kane", 1941), + /// ("The Wizard of Oz", 1939), + /// ("M", 1931), + /// ]); + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + /// + /// Or with named groups: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap(); + /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; + /// let mut it = re.captures_iter(hay); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], "Citizen Kane"); + /// assert_eq!(&caps["year"], "1941"); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], "The Wizard of Oz"); + /// assert_eq!(&caps["year"], "1939"); + /// + /// let caps = it.next().unwrap(); + /// assert_eq!(&caps["title"], "M"); + /// assert_eq!(&caps["year"], "1931"); + /// ``` + #[inline] + pub fn captures_iter<'r, 'h>( + &'r self, + haystack: &'h str, + ) -> CaptureMatches<'r, 'h> { + CaptureMatches { haystack, it: self.meta.captures_iter(haystack) } + } + + /// Returns an iterator of substrings of the haystack given, delimited by a + /// match of the regex. 
Namely, each element of the iterator corresponds to + /// a part of the haystack that *isn't* matched by the regular expression. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// # Example + /// + /// To split a string delimited by arbitrary amounts of spaces or tabs: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"[ \t]+").unwrap(); + /// let hay = "a b \t c\td e"; + /// let fields: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); + /// ``` + /// + /// # Example: more cases + /// + /// Basic usage: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = "Mary had a little lamb"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = ""; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec![""]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "lionXXtigerXleopard"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); + /// + /// let re = Regex::new(r"::").unwrap(); + /// let hay = "lion::tiger::leopard"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); + /// ``` + /// + /// If a haystack contains multiple contiguous matches, you will end up + /// with empty spans yielded by the iterator: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "XXXXaXXbXc"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", 
"c"]); + /// + /// let re = Regex::new(r"/").unwrap(); + /// let hay = "(///)"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["(", "", "", ")"]); + /// ``` + /// + /// Separators at the start or end of a haystack are neighbored by empty + /// substring. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"0").unwrap(); + /// let hay = "010"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "1", ""]); + /// ``` + /// + /// When the empty string is used as a regex, it splits at every valid + /// UTF-8 boundary by default (which includes the beginning and end of the + /// haystack): + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let hay = "rust"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); + /// + /// // Splitting by an empty string is UTF-8 aware by default! + /// let re = Regex::new(r"").unwrap(); + /// let hay = "☃"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "☃", ""]); + /// ``` + /// + /// Contiguous separators (commonly shows up with whitespace), can lead to + /// possibly surprising behavior. For example, this code is correct: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = " a b c"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); + /// ``` + /// + /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want + /// to match contiguous space characters: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r" +").unwrap(); + /// let hay = " a b c"; + /// let got: Vec<&str> = re.split(hay).collect(); + /// // N.B. This does still include a leading empty span because ' +' + /// // matches at the beginning of the haystack. 
+ /// assert_eq!(got, vec!["", "a", "b", "c"]); + /// ``` + #[inline] + pub fn split<'r, 'h>(&'r self, haystack: &'h str) -> Split<'r, 'h> { + Split { haystack, it: self.meta.split(haystack) } + } + + /// Returns an iterator of at most `limit` substrings of the haystack + /// given, delimited by a match of the regex. (A `limit` of `0` will return + /// no substrings.) Namely, each element of the iterator corresponds to a + /// part of the haystack that *isn't* matched by the regular expression. + /// The remainder of the haystack that is not split will be the last + /// element in the iterator. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// Although note that the worst case time here has an upper bound given + /// by the `limit` parameter. + /// + /// # Example + /// + /// Get the first two words in some haystack: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\W+").unwrap(); + /// let hay = "Hey! 
How are you?"; + /// let fields: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); + /// ``` + /// + /// # Examples: more cases + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r" ").unwrap(); + /// let hay = "Mary had a little lamb"; + /// let got: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = ""; + /// let got: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec![""]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "lionXXtigerXleopard"; + /// let got: Vec<&str> = re.splitn(hay, 3).collect(); + /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); + /// + /// let re = Regex::new(r"::").unwrap(); + /// let hay = "lion::tiger::leopard"; + /// let got: Vec<&str> = re.splitn(hay, 2).collect(); + /// assert_eq!(got, vec!["lion", "tiger::leopard"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "abcXdef"; + /// let got: Vec<&str> = re.splitn(hay, 1).collect(); + /// assert_eq!(got, vec!["abcXdef"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "abcdef"; + /// let got: Vec<&str> = re.splitn(hay, 2).collect(); + /// assert_eq!(got, vec!["abcdef"]); + /// + /// let re = Regex::new(r"X").unwrap(); + /// let hay = "abcXdef"; + /// let got: Vec<&str> = re.splitn(hay, 0).collect(); + /// assert!(got.is_empty()); + /// ``` + #[inline] + pub fn splitn<'r, 'h>( + &'r self, + haystack: &'h str, + limit: usize, + ) -> SplitN<'r, 'h> { + SplitN { haystack, it: self.meta.splitn(haystack, limit) } + } + + /// Replaces the leftmost-first match in the given haystack with the + /// replacement provided. The replacement can be a regular string (where + /// `$N` and `$name` are expanded to match capture groups) or a function + /// that takes a [`Captures`] and returns the replaced string. 
+ /// + /// If no match is found, then the haystack is returned unchanged. In that + /// case, this implementation will likely return a `Cow::Borrowed` value + /// such that no allocation is performed. + /// + /// # Replacement string syntax + /// + /// All instances of `$ref` in the replacement string are replaced with + /// the substring corresponding to the capture group identified by `ref`. + /// + /// `ref` may be an integer corresponding to the index of the capture group + /// (counted by order of opening parenthesis where `0` is the entire match) + /// or it can be a name (consisting of letters, digits or underscores) + /// corresponding to a named capture group. + /// + /// If `ref` isn't a valid capture group (whether the name doesn't exist or + /// isn't a valid index), then it is replaced with the empty string. + /// + /// The longest possible name is used. For example, `$1a` looks up the + /// capture group named `1a` and not the capture group at index `1`. To + /// exert more precise control over the name, use braces, e.g., `${1}a`. + /// + /// To write a literal `$` use `$$`. + /// + /// # Example + /// + /// Note that this function is polymorphic with respect to the replacement. + /// In typical usage, this can just be a normal string: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"[^01]+").unwrap(); + /// assert_eq!(re.replace("1078910", ""), "1010"); + /// ``` + /// + /// But anything satisfying the [`Replacer`] trait will work. For example, + /// a closure of type `|&Captures| -> String` provides direct access to the + /// captures corresponding to a match. 
This allows one to access capturing + /// group matches easily: + /// + /// ``` + /// use regex::{Captures, Regex}; + /// + /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { + /// format!("{} {}", &caps[2], &caps[1]) + /// }); + /// assert_eq!(result, "Bruce Springsteen"); + /// ``` + /// + /// But this is a bit cumbersome to use all the time. Instead, a simple + /// syntax is supported (as described above) that expands `$name` into the + /// corresponding capture group. Here's the last example, but using this + /// expansion technique with named capture groups: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", "$first $last"); + /// assert_eq!(result, "Bruce Springsteen"); + /// ``` + /// + /// Note that using `$2` instead of `$first` or `$1` instead of `$last` + /// would produce the same result. To write a literal `$` use `$$`. + /// + /// Sometimes the replacement string requires use of curly braces to + /// delineate a capture group replacement when it is adjacent to some other + /// literal text. For example, if we wanted to join two words together with + /// an underscore: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap(); + /// let result = re.replace("deep fried", "${first}_$second"); + /// assert_eq!(result, "deep_fried"); + /// ``` + /// + /// Without the curly braces, the capture group name `first_` would be + /// used, and since it doesn't exist, it would be replaced with the empty + /// string. + /// + /// Finally, sometimes you just want to replace a literal string with no + /// regard for capturing group expansion. 
This can be done by wrapping a + /// string with [`NoExpand`]: + /// + /// ``` + /// use regex::{NoExpand, Regex}; + /// + /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); + /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); + /// assert_eq!(result, "$2 $last"); + /// ``` + /// + /// Using `NoExpand` may also be faster, since the replacement string won't + /// need to be parsed for the `$` syntax. + #[inline] + pub fn replace<'h, R: Replacer>( + &self, + haystack: &'h str, + rep: R, + ) -> Cow<'h, str> { + self.replacen(haystack, 1, rep) + } + + /// Replaces all non-overlapping matches in the haystack with the + /// replacement provided. This is the same as calling `replacen` with + /// `limit` set to `0`. + /// + /// The documentation for [`Regex::replace`] goes into more detail about + /// what kinds of replacement strings are supported. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// # Fallibility + /// + /// If you need to write a replacement routine where any individual + /// replacement might "fail," doing so with this API isn't really feasible + /// because there's no way to stop the search process if a replacement + /// fails. 
Instead, if you need this functionality, you should consider + /// implementing your own replacement routine: + /// + /// ``` + /// use regex::{Captures, Regex}; + /// + /// fn replace_all<E>( + /// re: &Regex, + /// haystack: &str, + /// replacement: impl Fn(&Captures) -> Result<String, E>, + /// ) -> Result<String, E> { + /// let mut new = String::with_capacity(haystack.len()); + /// let mut last_match = 0; + /// for caps in re.captures_iter(haystack) { + /// let m = caps.get(0).unwrap(); + /// new.push_str(&haystack[last_match..m.start()]); + /// new.push_str(&replacement(&caps)?); + /// last_match = m.end(); + /// } + /// new.push_str(&haystack[last_match..]); + /// Ok(new) + /// } + /// + /// // Let's replace each word with the number of bytes in that word. + /// // But if we see a word that is "too long," we'll give up. + /// let re = Regex::new(r"\w+").unwrap(); + /// let replacement = |caps: &Captures| -> Result<String, &'static str> { + /// if caps[0].len() >= 5 { + /// return Err("word too long"); + /// } + /// Ok(caps[0].len().to_string()) + /// }; + /// assert_eq!( + /// Ok("2 3 3 3?".to_string()), + /// replace_all(&re, "hi how are you?", &replacement), + /// ); + /// assert!(replace_all(&re, "hi there", &replacement).is_err()); + /// ``` + /// + /// # Example + /// + /// This example shows how to flip the order of whitespace (excluding line + /// terminators) delimited fields, and normalizes the whitespace that + /// delimits the fields: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap(); + /// let hay = " + /// Greetings 1973 + /// Wild\t1973 + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "; + /// let new = re.replace_all(hay, "$2 $1"); + /// assert_eq!(new, " + /// 1973 Greetings + /// 1973 Wild + /// 1975 BornToRun + /// 1978 Darkness + /// 1980 TheRiver + /// "); + /// ``` + #[inline] + pub fn replace_all<'h, R: Replacer>( + &self, + haystack: &'h str, + rep: 
R, + ) -> Cow<'h, str> { + self.replacen(haystack, 0, rep) + } + + /// Replaces at most `limit` non-overlapping matches in the haystack with + /// the replacement provided. If `limit` is `0`, then all non-overlapping + /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is + /// equivalent to `Regex::replacen(hay, 0, rep)`. + /// + /// The documentation for [`Regex::replace`] goes into more detail about + /// what kinds of replacement strings are supported. + /// + /// # Time complexity + /// + /// Since iterators over all matches requires running potentially many + /// searches on the haystack, and since each search has worst case + /// `O(m * n)` time complexity, the overall worst case time complexity for + /// this routine is `O(m * n^2)`. + /// + /// Although note that the worst case time here has an upper bound given + /// by the `limit` parameter. + /// + /// # Fallibility + /// + /// See the corresponding section in the docs for [`Regex::replace_all`] + /// for tips on how to deal with a replacement routine that can fail. + /// + /// # Example + /// + /// This example shows how to flip the order of whitespace (excluding line + /// terminators) delimited fields, and normalizes the whitespace that + /// delimits the fields. But we only do it for the first two matches. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(?m)^(\S+)[\s--\r\n]+(\S+)$").unwrap(); + /// let hay = " + /// Greetings 1973 + /// Wild\t1973 + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "; + /// let new = re.replacen(hay, 2, "$2 $1"); + /// assert_eq!(new, " + /// 1973 Greetings + /// 1973 Wild + /// BornToRun\t\t\t\t1975 + /// Darkness 1978 + /// TheRiver 1980 + /// "); + /// ``` + #[inline] + pub fn replacen<'h, R: Replacer>( + &self, + haystack: &'h str, + limit: usize, + mut rep: R, + ) -> Cow<'h, str> { + // If we know that the replacement doesn't have any capture expansions, + // then we can use the fast path. 
The fast path can make a tremendous + // difference: + // + // 1) We use `find_iter` instead of `captures_iter`. Not asking for + // captures generally makes the regex engines faster. + // 2) We don't need to look up all of the capture groups and do + // replacements inside the replacement string. We just push it + // at each match and be done with it. + if let Some(rep) = rep.no_expansion() { + let mut it = self.find_iter(haystack).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(haystack); + } + let mut new = String::with_capacity(haystack.len()); + let mut last_match = 0; + for (i, m) in it { + new.push_str(&haystack[last_match..m.start()]); + new.push_str(&rep); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.push_str(&haystack[last_match..]); + return Cow::Owned(new); + } + + // The slower path, which we use if the replacement may need access to + // capture groups. + let mut it = self.captures_iter(haystack).enumerate().peekable(); + if it.peek().is_none() { + return Cow::Borrowed(haystack); + } + let mut new = String::with_capacity(haystack.len()); + let mut last_match = 0; + for (i, cap) in it { + // unwrap on 0 is OK because captures only reports matches + let m = cap.get(0).unwrap(); + new.push_str(&haystack[last_match..m.start()]); + rep.replace_append(&cap, &mut new); + last_match = m.end(); + if limit > 0 && i >= limit - 1 { + break; + } + } + new.push_str(&haystack[last_match..]); + Cow::Owned(new) + } +} + +/// A group of advanced or "lower level" search methods. Some methods permit +/// starting the search at a position greater than `0` in the haystack. Other +/// methods permit reusing allocations, for example, when extracting the +/// matches for capture groups. +impl Regex { + /// Returns the end byte offset of the first match in the haystack given. + /// + /// This method may have the same performance characteristics as + /// `is_match`. 
Behaviorally, it doesn't just report whether a match
+ /// occurs, but also the end offset for a match. In particular, the offset
+ /// returned *may be shorter* than the proper end of the leftmost-first
+ /// match that you would find via [`Regex::find`].
+ ///
+ /// Note that it is not guaranteed that this routine finds the shortest or
+ /// "earliest" possible match. Instead, the main idea of this API is that
+ /// it returns the offset at the point at which the internal regex engine
+ /// has determined that a match has occurred. This may vary depending on
+ /// which internal regex engine is used, and thus, the offset itself may
+ /// change based on internal heuristics.
+ ///
+ /// # Example
+ ///
+ /// Typically, `a+` would match the entire first sequence of `a` in some
+ /// haystack, but `shortest_match` *may* give up as soon as it sees the
+ /// first `a`.
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"a+").unwrap();
+ /// let offset = re.shortest_match("aaaaa").unwrap();
+ /// assert_eq!(offset, 1);
+ /// ```
+ #[inline]
+ pub fn shortest_match(&self, haystack: &str) -> Option<usize> {
+ self.shortest_match_at(haystack, 0)
+ }
+
+ /// Returns the same as [`Regex::shortest_match`], but starts the search at
+ /// the given offset.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only match
+ /// when `start == 0`.
+ ///
+ /// If a match is found, the offset returned is relative to the beginning
+ /// of the haystack, not the beginning of the search.
+ ///
+ /// # Panics
+ ///
+ /// This panics when `start >= haystack.len() + 1`.
+ ///
+ /// # Example
+ ///
+ /// This example shows the significance of `start` by demonstrating how it
+ /// can be used to permit look-around assertions in a regex to take the
+ /// surrounding context into account.
+ /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(re.shortest_match(&hay[2..]), Some(4)); + /// // No match because the assertions take the context into account. + /// assert_eq!(re.shortest_match_at(hay, 2), None); + /// ``` + #[inline] + pub fn shortest_match_at( + &self, + haystack: &str, + start: usize, + ) -> Option<usize> { + let input = + Input::new(haystack).earliest(true).span(start..haystack.len()); + self.meta.search_half(&input).map(|hm| hm.offset()) + } + + /// Returns the same as [`Regex::is_match`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert!(re.is_match(&hay[2..])); + /// // No match because the assertions take the context into account. + /// assert!(!re.is_match_at(hay, 2)); + /// ``` + #[inline] + pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { + let input = + Input::new(haystack).earliest(true).span(start..haystack.len()); + self.meta.search_half(&input).is_some() + } + + /// Returns the same as [`Regex::find`], but starts the search at the given + /// offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. 
For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4)); + /// // No match because the assertions take the context into account. + /// assert_eq!(re.find_at(hay, 2), None); + /// ``` + #[inline] + pub fn find_at<'h>( + &self, + haystack: &'h str, + start: usize, + ) -> Option<Match<'h>> { + let input = Input::new(haystack).span(start..haystack.len()); + self.meta + .search(&input) + .map(|m| Match::new(haystack, m.start(), m.end())) + } + + /// Returns the same as [`Regex::captures`], but starts the search at the + /// given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// // We get a match here, but it's probably not intended. + /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], "chew"); + /// // No match because the assertions take the context into account. 
+ /// assert!(re.captures_at(hay, 2).is_none());
+ /// ```
+ #[inline]
+ pub fn captures_at<'h>(
+ &self,
+ haystack: &'h str,
+ start: usize,
+ ) -> Option<Captures<'h>> {
+ let input = Input::new(haystack).span(start..haystack.len());
+ let mut caps = self.meta.create_captures();
+ self.meta.search_captures(&input, &mut caps);
+ if caps.is_match() {
+ let static_captures_len = self.static_captures_len();
+ Some(Captures { haystack, caps, static_captures_len })
+ } else {
+ None
+ }
+ }
+
+ /// This is like [`Regex::captures`], but writes the byte offsets of each
+ /// capture group match into the locations given.
+ ///
+ /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`],
+ /// but does *not* store a reference to the haystack. This makes its API
+ /// a bit lower level and less convenient. But in exchange, callers
+ /// may allocate their own `CaptureLocations` and reuse it for multiple
+ /// searches. This may be helpful if allocating a `Captures` shows up in a
+ /// profile as too costly.
+ ///
+ /// To create a `CaptureLocations` value, use the
+ /// [`Regex::capture_locations`] method.
+ ///
+ /// This also returns the overall match if one was found. When a match is
+ /// found, its offsets are also always stored in `locs` at index `0`.
+ ///
+ /// # Panics
+ ///
+ /// This routine may panic if the given `CaptureLocations` was not created
+ /// by this regex.
+ /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert!(re.captures_read(&mut locs, "id=foo123").is_some()); + /// assert_eq!(Some((0, 9)), locs.get(0)); + /// assert_eq!(Some((0, 2)), locs.get(1)); + /// assert_eq!(Some((3, 9)), locs.get(2)); + /// ``` + #[inline] + pub fn captures_read<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h str, + ) -> Option<Match<'h>> { + self.captures_read_at(locs, haystack, 0) + } + + /// Returns the same as [`Regex::captures_read`], but starts the search at + /// the given offset. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// This routine may also panic if the given `CaptureLocations` was not + /// created by this regex. + /// + /// # Example + /// + /// This example shows the significance of `start` by demonstrating how it + /// can be used to permit look-around assertions in a regex to take the + /// surrounding context into account. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"\bchew\b").unwrap(); + /// let hay = "eschew"; + /// let mut locs = re.capture_locations(); + /// // We get a match here, but it's probably not intended. + /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some()); + /// // No match because the assertions take the context into account. 
+ /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none()); + /// ``` + #[inline] + pub fn captures_read_at<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h str, + start: usize, + ) -> Option<Match<'h>> { + let input = Input::new(haystack).span(start..haystack.len()); + self.meta.search_captures(&input, &mut locs.0); + locs.0.get_match().map(|m| Match::new(haystack, m.start(), m.end())) + } + + /// An undocumented alias for `captures_read_at`. + /// + /// The `regex-capi` crate previously used this routine, so to avoid + /// breaking that crate, we continue to provide the name as an undocumented + /// alias. + #[doc(hidden)] + #[inline] + pub fn read_captures_at<'h>( + &self, + locs: &mut CaptureLocations, + haystack: &'h str, + start: usize, + ) -> Option<Match<'h>> { + self.captures_read_at(locs, haystack, start) + } +} + +/// Auxiliary methods. +impl Regex { + /// Returns the original string of this regex. + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"foo\w+bar").unwrap(); + /// assert_eq!(re.as_str(), r"foo\w+bar"); + /// ``` + #[inline] + pub fn as_str(&self) -> &str { + &self.pattern + } + + /// Returns an iterator over the capture names in this regex. + /// + /// The iterator returned yields elements of type `Option<&str>`. That is, + /// the iterator yields values for all capture groups, even ones that are + /// unnamed. The order of the groups corresponds to the order of the group's + /// corresponding opening parenthesis. + /// + /// The first element of the iterator always yields the group corresponding + /// to the overall match, and this group is always unnamed. Therefore, the + /// iterator always yields at least one group. 
+ ///
+ /// # Example
+ ///
+ /// This shows basic usage with a mix of named and unnamed capture groups:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), Some(Some("a")));
+ /// assert_eq!(names.next(), Some(Some("b")));
+ /// assert_eq!(names.next(), Some(None));
+ /// // the '(?:.)' group is non-capturing and so doesn't appear here!
+ /// assert_eq!(names.next(), Some(Some("c")));
+ /// assert_eq!(names.next(), None);
+ /// ```
+ ///
+ /// The iterator always yields at least one element, even for regexes with
+ /// no capture groups and even for regexes that can never match:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r"").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), None);
+ ///
+ /// let re = Regex::new(r"[a&&b]").unwrap();
+ /// let mut names = re.capture_names();
+ /// assert_eq!(names.next(), Some(None));
+ /// assert_eq!(names.next(), None);
+ /// ```
+ #[inline]
+ pub fn capture_names(&self) -> CaptureNames<'_> {
+ CaptureNames(self.meta.group_info().pattern_names(PatternID::ZERO))
+ }
+
+ /// Returns the number of capture groups in this regex.
+ ///
+ /// This includes all named and unnamed groups, including the implicit
+ /// unnamed group that is always present and corresponds to the entire
+ /// match.
+ ///
+ /// Since the implicit unnamed group is always included in this length, the
+ /// length returned is guaranteed to be greater than zero.
+ /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"foo").unwrap(); + /// assert_eq!(1, re.captures_len()); + /// + /// let re = Regex::new(r"(foo)").unwrap(); + /// assert_eq!(2, re.captures_len()); + /// + /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); + /// assert_eq!(5, re.captures_len()); + /// + /// let re = Regex::new(r"[a&&b]").unwrap(); + /// assert_eq!(1, re.captures_len()); + /// ``` + #[inline] + pub fn captures_len(&self) -> usize { + self.meta.group_info().group_len(PatternID::ZERO) + } + + /// Returns the total number of capturing groups that appear in every + /// possible match. + /// + /// If the number of capture groups can vary depending on the match, then + /// this returns `None`. That is, a value is only returned when the number + /// of matching groups is invariant or "static." + /// + /// Note that like [`Regex::captures_len`], this **does** include the + /// implicit capturing group corresponding to the entire match. Therefore, + /// when a non-None value is returned, it is guaranteed to be at least `1`. + /// Stated differently, a return value of `Some(0)` is impossible. + /// + /// # Example + /// + /// This shows a few cases where a static number of capture groups is + /// available and a few cases where it is not. 
+ /// + /// ``` + /// use regex::Regex; + /// + /// let len = |pattern| { + /// Regex::new(pattern).map(|re| re.static_captures_len()) + /// }; + /// + /// assert_eq!(Some(1), len("a")?); + /// assert_eq!(Some(2), len("(a)")?); + /// assert_eq!(Some(2), len("(a)|(b)")?); + /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); + /// assert_eq!(None, len("(a)|b")?); + /// assert_eq!(None, len("a|(b)")?); + /// assert_eq!(None, len("(b)*")?); + /// assert_eq!(Some(2), len("(b)+")?); + /// + /// # Ok::<(), Box<dyn std::error::Error>>(()) + /// ``` + #[inline] + pub fn static_captures_len(&self) -> Option<usize> { + self.meta.static_captures_len() + } + + /// Returns a fresh allocated set of capture locations that can + /// be reused in multiple calls to [`Regex::captures_read`] or + /// [`Regex::captures_read_at`]. + /// + /// The returned locations can be used for any subsequent search for this + /// particular regex. There is no guarantee that it is correct to use for + /// other regexes, even if they have the same number of capture groups. + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(.)(.)(\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert!(re.captures_read(&mut locs, "Padron").is_some()); + /// assert_eq!(locs.get(0), Some((0, 6))); + /// assert_eq!(locs.get(1), Some((0, 1))); + /// assert_eq!(locs.get(2), Some((1, 2))); + /// assert_eq!(locs.get(3), Some((2, 6))); + /// ``` + #[inline] + pub fn capture_locations(&self) -> CaptureLocations { + CaptureLocations(self.meta.create_captures()) + } + + /// An alias for `capture_locations` to preserve backward compatibility. + /// + /// The `regex-capi` crate used this method, so to avoid breaking that + /// crate, we continue to export it as an undocumented API. + #[doc(hidden)] + #[inline] + pub fn locations(&self) -> CaptureLocations { + self.capture_locations() + } +} + +/// Represents a single match of a regex in a haystack. 
+///
+/// A `Match` contains both the start and end byte offsets of the match and the
+/// actual substring corresponding to the range of those byte offsets. It is
+/// guaranteed that `start <= end`. When `start == end`, the match is empty.
+///
+/// Since this `Match` can only be produced by the top-level `Regex` APIs
+/// that only support searching UTF-8 encoded strings, the byte offsets for a
+/// `Match` are guaranteed to fall on valid UTF-8 codepoint boundaries. That
+/// is, slicing a `&str` with [`Match::range`] is guaranteed to never panic.
+///
+/// Values with this type are created by [`Regex::find`] or
+/// [`Regex::find_iter`]. Other APIs can create `Match` values too. For
+/// example, [`Captures::get`].
+///
+/// The lifetime parameter `'h` refers to the lifetime of the
+/// haystack that this match was produced from.
+///
+/// # Numbering
+///
+/// The byte offsets in a `Match` form a half-open interval. That is, the
+/// start of the range is inclusive and the end of the range is exclusive.
+/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte
+/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and
+/// `6` corresponds to `x`, which is one past the end of the match. This
+/// corresponds to the same kind of slicing that Rust uses.
+///
+/// For more on why this was chosen over other schemes (aside from being
+/// consistent with how Rust the language works), see [this discussion] and
+/// [Dijkstra's note on a related topic][note].
+///
+/// [this discussion]: https://github.com/rust-lang/regex/discussions/866
+/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
+///
+/// # Example
+///
+/// This example shows the value of each of the methods on `Match` for a
+/// particular search.
+/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"\p{Greek}+").unwrap(); +/// let hay = "Greek: αβγδ"; +/// let m = re.find(hay).unwrap(); +/// assert_eq!(7, m.start()); +/// assert_eq!(15, m.end()); +/// assert!(!m.is_empty()); +/// assert_eq!(8, m.len()); +/// assert_eq!(7..15, m.range()); +/// assert_eq!("αβγδ", m.as_str()); +/// ``` +#[derive(Copy, Clone, Eq, PartialEq)] +pub struct Match<'h> { + haystack: &'h str, + start: usize, + end: usize, +} + +impl<'h> Match<'h> { + /// Returns the byte offset of the start of the match in the haystack. The + /// start of the match corresponds to the position where the match begins + /// and includes the first byte in the match. + /// + /// It is guaranteed that `Match::start() <= Match::end()`. + /// + /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That + /// is, it will never be an offset that appears between the UTF-8 code + /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is + /// always safe to slice the corresponding haystack using this offset. + #[inline] + pub fn start(&self) -> usize { + self.start + } + + /// Returns the byte offset of the end of the match in the haystack. The + /// end of the match corresponds to the byte immediately following the last + /// byte in the match. This means that `&slice[start..end]` works as one + /// would expect. + /// + /// It is guaranteed that `Match::start() <= Match::end()`. + /// + /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That + /// is, it will never be an offset that appears between the UTF-8 code + /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is + /// always safe to slice the corresponding haystack using this offset. + #[inline] + pub fn end(&self) -> usize { + self.end + } + + /// Returns true if and only if this match has a length of zero. + /// + /// Note that an empty match can only occur when the regex itself can + /// match the empty string. 
Here are some examples of regexes that can + /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`, + /// `(foo|\d+|quux)?`. + #[inline] + pub fn is_empty(&self) -> bool { + self.start == self.end + } + + /// Returns the length, in bytes, of this match. + #[inline] + pub fn len(&self) -> usize { + self.end - self.start + } + + /// Returns the range over the starting and ending byte offsets of the + /// match in the haystack. + /// + /// It is always correct to slice the original haystack searched with this + /// range. That is, because the offsets are guaranteed to fall on valid + /// UTF-8 boundaries, the range returned is always valid. + #[inline] + pub fn range(&self) -> core::ops::Range<usize> { + self.start..self.end + } + + /// Returns the substring of the haystack that matched. + #[inline] + pub fn as_str(&self) -> &'h str { + &self.haystack[self.range()] + } + + /// Creates a new match from the given haystack and byte offsets. + #[inline] + fn new(haystack: &'h str, start: usize, end: usize) -> Match<'h> { + Match { haystack, start, end } + } +} + +impl<'h> core::fmt::Debug for Match<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { + f.debug_struct("Match") + .field("start", &self.start) + .field("end", &self.end) + .field("string", &self.as_str()) + .finish() + } +} + +impl<'h> From<Match<'h>> for &'h str { + fn from(m: Match<'h>) -> &'h str { + m.as_str() + } +} + +impl<'h> From<Match<'h>> for core::ops::Range<usize> { + fn from(m: Match<'h>) -> core::ops::Range<usize> { + m.range() + } +} + +/// Represents the capture groups for a single match. +/// +/// Capture groups refer to parts of a regex enclosed in parentheses. They can +/// be optionally named. The purpose of capture groups is to be able to +/// reference different parts of a match based on the original pattern. 
For +/// example, say you want to match the individual letters in a 5-letter word: +/// +/// ```text +/// (?<first>\w)(\w)(?:\w)\w(?<last>\w) +/// ``` +/// +/// This regex has 4 capture groups: +/// +/// * The group at index `0` corresponds to the overall match. It is always +/// present in every match and never has a name. +/// * The group at index `1` with name `first` corresponding to the first +/// letter. +/// * The group at index `2` with no name corresponding to the second letter. +/// * The group at index `3` with name `last` corresponding to the fifth and +/// last letter. +/// +/// Notice that `(?:\w)` was not listed above as a capture group despite it +/// being enclosed in parentheses. That's because `(?:pattern)` is a special +/// syntax that permits grouping but *without* capturing. The reason for not +/// treating it as a capture is that tracking and reporting capture groups +/// requires additional state that may lead to slower searches. So using as few +/// capture groups as possible can help performance. (Although the difference +/// in performance of a couple of capture groups is likely immaterial.) +/// +/// Values with this type are created by [`Regex::captures`] or +/// [`Regex::captures_iter`]. +/// +/// `'h` is the lifetime of the haystack that these captures were matched from. +/// +/// # Example +/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap(); +/// let caps = re.captures("toady").unwrap(); +/// assert_eq!("toady", &caps[0]); +/// assert_eq!("t", &caps["first"]); +/// assert_eq!("o", &caps[2]); +/// assert_eq!("y", &caps["last"]); +/// ``` +pub struct Captures<'h> { + haystack: &'h str, + caps: captures::Captures, + static_captures_len: Option<usize>, +} + +impl<'h> Captures<'h> { + /// Returns the `Match` associated with the capture group at index `i`. 
If + /// `i` does not correspond to a capture group, or if the capture group did + /// not participate in the match, then `None` is returned. + /// + /// When `i == 0`, this is guaranteed to return a non-`None` value. + /// + /// # Examples + /// + /// Get the substring that matched with a default of an empty string if the + /// group didn't participate in the match: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); + /// let caps = re.captures("abc123").unwrap(); + /// + /// let substr1 = caps.get(1).map_or("", |m| m.as_str()); + /// let substr2 = caps.get(2).map_or("", |m| m.as_str()); + /// assert_eq!(substr1, "123"); + /// assert_eq!(substr2, ""); + /// ``` + #[inline] + pub fn get(&self, i: usize) -> Option<Match<'h>> { + self.caps + .get_group(i) + .map(|sp| Match::new(self.haystack, sp.start, sp.end)) + } + + /// Returns the `Match` associated with the capture group named `name`. If + /// `name` isn't a valid capture group or it refers to a group that didn't + /// match, then `None` is returned. + /// + /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime + /// matches the lifetime of the haystack in this `Captures` value. + /// Conversely, the substring returned by `caps["name"]` has a lifetime + /// of the `Captures` value, which is likely shorter than the lifetime of + /// the haystack. In some cases, it may be necessary to use this method to + /// access the matching substring instead of the `caps["name"]` notation. 
+ ///
+ /// # Examples
+ ///
+ /// Get the substring that matched with a default of an empty string if the
+ /// group didn't participate in the match:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(
+ /// r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))",
+ /// ).unwrap();
+ /// let caps = re.captures("abc123").unwrap();
+ ///
+ /// let numbers = caps.name("numbers").map_or("", |m| m.as_str());
+ /// let letters = caps.name("letters").map_or("", |m| m.as_str());
+ /// assert_eq!(numbers, "123");
+ /// assert_eq!(letters, "");
+ /// ```
+ #[inline]
+ pub fn name(&self, name: &str) -> Option<Match<'h>> {
+ self.caps
+ .get_group_by_name(name)
+ .map(|sp| Match::new(self.haystack, sp.start, sp.end))
+ }
+
+ /// This is a convenience routine for extracting the substrings
+ /// corresponding to matching capture groups.
+ ///
+ /// This returns a tuple where the first element corresponds to the full
+ /// substring of the haystack that matched the regex. The second element is
+ /// an array of substrings, with each corresponding to the substring
+ /// that matched for a particular capture group.
+ ///
+ /// # Panics
+ ///
+ /// This panics if the number of possible matching groups in this
+ /// `Captures` value is not fixed to `N` in all circumstances.
+ /// More precisely, this routine only works when `N` is equivalent to
+ /// [`Regex::static_captures_len`].
+ ///
+ /// Stated more plainly, if the number of matching capture groups in a
+ /// regex can vary from match to match, then this function always panics.
+ ///
+ /// For example, `(a)(b)|(c)` could produce two matching capture groups
+ /// or one matching capture group for any given match. Therefore, one
+ /// cannot use `extract` with such a pattern.
+ ///
+ /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because
+ /// the number of capture groups in every match is always equivalent,
+ /// even if the capture _indices_ in each match are not.
+ /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); + /// let hay = "On 2010-03-14, I became a Tenneessee lamb."; + /// let Some((full, [year, month, day])) = + /// re.captures(hay).map(|caps| caps.extract()) else { return }; + /// assert_eq!("2010-03-14", full); + /// assert_eq!("2010", year); + /// assert_eq!("03", month); + /// assert_eq!("14", day); + /// ``` + /// + /// # Example: iteration + /// + /// This example shows how to use this method when iterating over all + /// `Captures` matches in a haystack. + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap(); + /// let hay = "1973-01-05, 1975-08-25 and 1980-10-18"; + /// + /// let mut dates: Vec<(&str, &str, &str)> = vec![]; + /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) { + /// dates.push((y, m, d)); + /// } + /// assert_eq!(dates, vec![ + /// ("1973", "01", "05"), + /// ("1975", "08", "25"), + /// ("1980", "10", "18"), + /// ]); + /// ``` + /// + /// # Example: parsing different formats + /// + /// This API is particularly useful when you need to extract a particular + /// value that might occur in a different format. 
Consider, for example,
+ /// an identifier that might be in double quotes or single quotes:
+ ///
+ /// ```
+ /// use regex::Regex;
+ ///
+ /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap();
+ /// let hay = r#"The first is id:"foo" and the second is id:'bar'."#;
+ /// let mut ids = vec![];
+ /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) {
+ /// ids.push(id);
+ /// }
+ /// assert_eq!(ids, vec!["foo", "bar"]);
+ /// ```
+ pub fn extract<const N: usize>(&self) -> (&'h str, [&'h str; N]) {
+ let len = self
+ .static_captures_len
+ .expect("number of capture groups can vary in a match")
+ .checked_sub(1)
+ .expect("number of groups is always greater than zero");
+ assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len);
+ // The regex-automata variant of extract is a bit more permissive.
+ // It doesn't require the number of matching capturing groups to be
+ // static, and you can even request fewer groups than what's there. So
+ // this is guaranteed to never panic because we've asserted above that
+ // the user has requested precisely the number of groups that must be
+ // present in any match for this regex.
+ self.caps.extract(self.haystack)
+ }
+
+ /// Expands all instances of `$ref` in `replacement` to the corresponding
+ /// capture group, and writes them to the `dst` buffer given. A `ref` can
+ /// be a capture group index or a name. If `ref` doesn't refer to a capture
+ /// group that participated in the match, then it is replaced with the
+ /// empty string.
+ ///
+ /// # Format
+ ///
+ /// The format of the replacement string supports two different kinds of
+ /// capture references: unbraced and braced.
+ ///
+ /// For the unbraced format, the format supported is `$ref` where `ref`
+ /// can be any character in the class `[0-9A-Za-z_]`. `ref` is always
+ /// the longest possible parse. So for example, `$1a` corresponds to the
+ /// capture group named `1a` and not the capture group at index `1`.
If + /// `ref` matches `^[0-9]+$`, then it is treated as a capture group index + /// itself and not a name. + /// + /// For the braced format, the format supported is `${ref}` where `ref` can + /// be any sequence of bytes except for `}`. If no closing brace occurs, + /// then it is not considered a capture reference. As with the unbraced + /// format, if `ref` matches `^[0-9]+$`, then it is treated as a capture + /// group index and not a name. + /// + /// The braced format is useful for exerting precise control over the name + /// of the capture reference. For example, `${1}a` corresponds to the + /// capture group reference `1` followed by the letter `a`, where as `$1a` + /// (as mentioned above) corresponds to the capture group reference `1a`. + /// The braced format is also useful for expressing capture group names + /// that use characters not supported by the unbraced format. For example, + /// `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`. + /// + /// If a capture group reference is found and it does not refer to a valid + /// capture group, then it will be replaced with the empty string. + /// + /// To write a literal `$`, use `$$`. + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new( + /// r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})", + /// ).unwrap(); + /// let hay = "On 14-03-2010, I became a Tenneessee lamb."; + /// let caps = re.captures(hay).unwrap(); + /// + /// let mut dst = String::new(); + /// caps.expand("year=$year, month=$month, day=$day", &mut dst); + /// assert_eq!(dst, "year=2010, month=03, day=14"); + /// ``` + #[inline] + pub fn expand(&self, replacement: &str, dst: &mut String) { + self.caps.interpolate_string_into(self.haystack, replacement, dst); + } + + /// Returns an iterator over all capture groups. This includes both + /// matching and non-matching groups. 
+ /// + /// The iterator always yields at least one matching group: the first group + /// (at index `0`) with no name. Subsequent groups are returned in the order + /// of their opening parenthesis in the regex. + /// + /// The elements yielded have type `Option<Match<'h>>`, where a non-`None` + /// value is present if the capture group matches. + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap(); + /// let caps = re.captures("AZ").unwrap(); + /// + /// let mut it = caps.iter(); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("AZ")); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("A")); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), None); + /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("Z")); + /// assert_eq!(it.next(), None); + /// ``` + #[inline] + pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> { + SubCaptureMatches { haystack: self.haystack, it: self.caps.iter() } + } + + /// Returns the total number of capture groups. This includes both + /// matching and non-matching groups. + /// + /// The length returned is always equivalent to the number of elements + /// yielded by [`Captures::iter`]. Consequently, the length is always + /// greater than zero since every `Captures` value always includes the + /// match for the entire regex. + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap(); + /// let caps = re.captures("AZ").unwrap(); + /// assert_eq!(caps.len(), 4); + /// ``` + #[inline] + pub fn len(&self) -> usize { + self.caps.group_len() + } +} + +impl<'h> core::fmt::Debug for Captures<'h> { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + /// A little helper type to provide a nice map-like debug + /// representation for our capturing group spans. 
+ ///
+ /// regex-automata has something similar, but it includes the pattern
+ /// ID in its debug output, which is confusing. It also doesn't include
+ /// the strings that match because a regex-automata `Captures` doesn't
+ /// borrow the haystack.
+ struct CapturesDebugMap<'a> {
+ caps: &'a Captures<'a>,
+ }
+
+ impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ let mut map = f.debug_map();
+ let names =
+ self.caps.caps.group_info().pattern_names(PatternID::ZERO);
+ for (group_index, maybe_name) in names.enumerate() {
+ let key = Key(group_index, maybe_name);
+ match self.caps.get(group_index) {
+ None => map.entry(&key, &None::<()>),
+ Some(mat) => map.entry(&key, &Value(mat)),
+ };
+ }
+ map.finish()
+ }
+ }
+
+ struct Key<'a>(usize, Option<&'a str>);
+
+ impl<'a> core::fmt::Debug for Key<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(f, "{}", self.0)?;
+ if let Some(name) = self.1 {
+ write!(f, "/{:?}", name)?;
+ }
+ Ok(())
+ }
+ }
+
+ struct Value<'a>(Match<'a>);
+
+ impl<'a> core::fmt::Debug for Value<'a> {
+ fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
+ write!(
+ f,
+ "{}..{}/{:?}",
+ self.0.start(),
+ self.0.end(),
+ self.0.as_str()
+ )
+ }
+ }
+
+ f.debug_tuple("Captures")
+ .field(&CapturesDebugMap { caps: self })
+ .finish()
+ }
+ }
+
+ /// Get a matching capture group's haystack substring by index.
+ ///
+ /// The haystack substring returned can't outlive the `Captures` object if this
+ /// method is used, because of how `Index` is defined (normally `a[i]` is part
+ /// of `a` and can't outlive it). To work around this limitation, use
+ /// [`Captures::get`] instead.
+ ///
+ /// `'h` is the lifetime of the matched haystack, but the lifetime of the
+ /// `&str` returned by this implementation is the lifetime of the `Captures`
+ /// value itself.
+///
+/// # Panics
+///
+/// If there is no matching group at the given index.
+impl<'h> core::ops::Index<usize> for Captures<'h> {
+ type Output = str;
+
+ // The lifetime is written out to make it clear that the &str returned
+ // does NOT have a lifetime equivalent to 'h.
+ fn index<'a>(&'a self, i: usize) -> &'a str {
+ self.get(i)
+ .map(|m| m.as_str())
+ .unwrap_or_else(|| panic!("no group at index '{}'", i))
+ }
+}
+
+/// Get a matching capture group's haystack substring by name.
+///
+/// The haystack substring returned can't outlive the `Captures` object if this
+/// method is used, because of how `Index` is defined (normally `a[i]` is part
+/// of `a` and can't outlive it). To work around this limitation, use
+/// [`Captures::get`] instead.
+///
+/// `'h` is the lifetime of the matched haystack, but the lifetime of the
+/// `&str` returned by this implementation is the lifetime of the `Captures`
+/// value itself.
+///
+/// `'n` is the lifetime of the group name used to index the `Captures` value.
+///
+/// # Panics
+///
+/// If there is no matching group at the given name.
+impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
+ type Output = str;
+
+ fn index<'a>(&'a self, name: &'n str) -> &'a str {
+ self.name(name)
+ .map(|m| m.as_str())
+ .unwrap_or_else(|| panic!("no group named '{}'", name))
+ }
+}
+
+/// A low level representation of the byte offsets of each capture group.
+///
+/// You can think of this as a lower level [`Captures`], where this type does
+/// not support named capturing groups directly and it does not borrow the
+/// haystack that these offsets were matched on.
+///
+/// Primarily, this type is useful when using the lower level `Regex` APIs such
+/// as [`Regex::captures_read`], which permits amortizing the allocation in
+/// which capture match offsets are stored.
+///
+/// In order to build a value of this type, you'll need to call the
+/// [`Regex::capture_locations`] method.
The value returned can then be reused +/// in subsequent searches for that regex. Using it for other regexes may +/// result in a panic or otherwise incorrect results. +/// +/// # Example +/// +/// This example shows how to create and use `CaptureLocations` in a search. +/// +/// ``` +/// use regex::Regex; +/// +/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); +/// let mut locs = re.capture_locations(); +/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); +/// assert_eq!(0..17, m.range()); +/// assert_eq!(Some((0, 17)), locs.get(0)); +/// assert_eq!(Some((0, 5)), locs.get(1)); +/// assert_eq!(Some((6, 17)), locs.get(2)); +/// +/// // Asking for an invalid capture group always returns None. +/// assert_eq!(None, locs.get(3)); +/// assert_eq!(None, locs.get(34973498648)); +/// assert_eq!(None, locs.get(9944060567225171988)); +/// ``` +#[derive(Clone, Debug)] +pub struct CaptureLocations(captures::Captures); + +/// A type alias for `CaptureLocations` for backwards compatibility. +/// +/// Previously, we exported `CaptureLocations` as `Locations` in an +/// undocumented API. To prevent breaking that code (e.g., in `regex-capi`), +/// we continue re-exporting the same undocumented API. +#[doc(hidden)] +pub type Locations = CaptureLocations; + +impl CaptureLocations { + /// Returns the start and end byte offsets of the capture group at index + /// `i`. This returns `None` if `i` is not a valid capture group or if the + /// capture group did not match. 
+ /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); + /// assert_eq!(Some((0, 17)), locs.get(0)); + /// assert_eq!(Some((0, 5)), locs.get(1)); + /// assert_eq!(Some((6, 17)), locs.get(2)); + /// ``` + #[inline] + pub fn get(&self, i: usize) -> Option<(usize, usize)> { + self.0.get_group(i).map(|sp| (sp.start, sp.end)) + } + + /// Returns the total number of capture groups (even if they didn't match). + /// That is, the length returned is unaffected by the result of a search. + /// + /// This is always at least `1` since every regex has at least `1` + /// capturing group that corresponds to the entire match. + /// + /// # Example + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap(); + /// let mut locs = re.capture_locations(); + /// assert_eq!(3, locs.len()); + /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap(); + /// assert_eq!(3, locs.len()); + /// ``` + /// + /// Notice that the length is always at least `1`, regardless of the regex: + /// + /// ``` + /// use regex::Regex; + /// + /// let re = Regex::new(r"").unwrap(); + /// let locs = re.capture_locations(); + /// assert_eq!(1, locs.len()); + /// + /// // [a&&b] is a regex that never matches anything. + /// let re = Regex::new(r"[a&&b]").unwrap(); + /// let locs = re.capture_locations(); + /// assert_eq!(1, locs.len()); + /// ``` + #[inline] + pub fn len(&self) -> usize { + // self.0.group_len() returns 0 if the underlying captures doesn't + // represent a match, but the behavior guaranteed for this method is + // that the length doesn't change based on a match or not. + self.0.group_info().group_len(PatternID::ZERO) + } + + /// An alias for the `get` method for backwards compatibility. 
+ ///
+ /// Previously, we exported `get` as `pos` in an undocumented API. To
+ /// prevent breaking that code (e.g., in `regex-capi`), we continue
+ /// re-exporting the same undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
+ self.get(i)
+ }
+}
+
+/// An iterator over all non-overlapping matches in a haystack.
+///
+/// This iterator yields [`Match`] values. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the haystack.
+///
+/// This iterator is created by [`Regex::find_iter`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct Matches<'r, 'h> {
+ haystack: &'h str,
+ it: meta::FindMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for Matches<'r, 'h> {
+ type Item = Match<'h>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Match<'h>> {
+ self.it
+ .next()
+ .map(|sp| Match::new(self.haystack, sp.start(), sp.end()))
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // This can actually be up to 2x faster than calling `next()` until
+ // completion, because counting matches when using a DFA only requires
+ // finding the end of each match. But returning a `Match` via `next()`
+ // requires the start of each match which, with a DFA, requires a
+ // reverse scan to find it.
+ self.it.count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {}
+
+/// An iterator over all non-overlapping capture matches in a haystack.
+///
+/// This iterator yields [`Captures`] values. The iterator stops when no more
+/// matches can be found.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the matched string.
+///
+/// This iterator is created by [`Regex::captures_iter`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)]
+pub struct CaptureMatches<'r, 'h> {
+ haystack: &'h str,
+ it: meta::CapturesMatches<'r, 'h>,
+}
+
+impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> {
+ type Item = Captures<'h>;
+
+ #[inline]
+ fn next(&mut self) -> Option<Captures<'h>> {
+ let static_captures_len = self.it.regex().static_captures_len();
+ self.it.next().map(|caps| Captures {
+ haystack: self.haystack,
+ caps,
+ static_captures_len,
+ })
+ }
+
+ #[inline]
+ fn count(self) -> usize {
+ // This can actually be up to 2x faster than calling `next()` until
+ // completion, because counting matches when using a DFA only requires
+ // finding the end of each match. But returning a `Match` via `next()`
+ // requires the start of each match which, with a DFA, requires a
+ // reverse scan to find it.
+ self.it.count()
+ }
+}
+
+impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {}
+
+/// An iterator over all substrings delimited by a regex match.
+///
+/// `'r` is the lifetime of the compiled regular expression and `'h` is the
+/// lifetime of the string being split.
+///
+/// This iterator is created by [`Regex::split`].
+///
+/// # Time complexity
+///
+/// Note that since an iterator runs potentially many searches on the haystack
+/// and since each search has worst case `O(m * n)` time complexity, the
+/// overall worst case time complexity for iteration is `O(m * n^2)`.
+#[derive(Debug)] +pub struct Split<'r, 'h> { + haystack: &'h str, + it: meta::Split<'r, 'h>, +} + +impl<'r, 'h> Iterator for Split<'r, 'h> { + type Item = &'h str; + + #[inline] + fn next(&mut self) -> Option<&'h str> { + self.it.next().map(|span| &self.haystack[span]) + } +} + +impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {} + +/// An iterator over at most `N` substrings delimited by a regex match. +/// +/// The last substring yielded by this iterator will be whatever remains after +/// `N-1` splits. +/// +/// `'r` is the lifetime of the compiled regular expression and `'h` is the +/// lifetime of the byte string being split. +/// +/// This iterator is created by [`Regex::splitn`]. +/// +/// # Time complexity +/// +/// Note that since an iterator runs potentially many searches on the haystack +/// and since each search has worst case `O(m * n)` time complexity, the +/// overall worst case time complexity for iteration is `O(m * n^2)`. +/// +/// Although note that the worst case time here has an upper bound given +/// by the `limit` parameter to [`Regex::splitn`]. +#[derive(Debug)] +pub struct SplitN<'r, 'h> { + haystack: &'h str, + it: meta::SplitN<'r, 'h>, +} + +impl<'r, 'h> Iterator for SplitN<'r, 'h> { + type Item = &'h str; + + #[inline] + fn next(&mut self) -> Option<&'h str> { + self.it.next().map(|span| &self.haystack[span]) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } +} + +impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {} + +/// An iterator over the names of all capture groups in a regex. +/// +/// This iterator yields values of type `Option<&str>` in order of the opening +/// capture group parenthesis in the regex pattern. `None` is yielded for +/// groups with no name. The first element always corresponds to the implicit +/// and unnamed group for the overall match. +/// +/// `'r` is the lifetime of the compiled regular expression. 
+/// +/// This iterator is created by [`Regex::capture_names`]. +#[derive(Clone, Debug)] +pub struct CaptureNames<'r>(captures::GroupInfoPatternNames<'r>); + +impl<'r> Iterator for CaptureNames<'r> { + type Item = Option<&'r str>; + + #[inline] + fn next(&mut self) -> Option<Option<&'r str>> { + self.0.next() + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.0.count() + } +} + +impl<'r> ExactSizeIterator for CaptureNames<'r> {} + +impl<'r> core::iter::FusedIterator for CaptureNames<'r> {} + +/// An iterator over all group matches in a [`Captures`] value. +/// +/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is the +/// lifetime of the haystack that the matches are for. The order of elements +/// yielded corresponds to the order of the opening parenthesis for the group +/// in the regex pattern. `None` is yielded for groups that did not participate +/// in the match. +/// +/// The first element always corresponds to the implicit group for the overall +/// match. Since this iterator is created by a [`Captures`] value, and a +/// `Captures` value is only created when a match occurs, it follows that the +/// first element yielded by this iterator is guaranteed to be non-`None`. +/// +/// The lifetime `'c` corresponds to the lifetime of the `Captures` value that +/// created this iterator, and the lifetime `'h` corresponds to the originally +/// matched haystack. 
+#[derive(Clone, Debug)] +pub struct SubCaptureMatches<'c, 'h> { + haystack: &'h str, + it: captures::CapturesPatternIter<'c>, +} + +impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> { + type Item = Option<Match<'h>>; + + #[inline] + fn next(&mut self) -> Option<Option<Match<'h>>> { + self.it.next().map(|group| { + group.map(|sp| Match::new(self.haystack, sp.start, sp.end)) + }) + } + + #[inline] + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } + + #[inline] + fn count(self) -> usize { + self.it.count() + } +} + +impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {} + +impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} + +/// A trait for types that can be used to replace matches in a haystack. +/// +/// In general, users of this crate shouldn't need to implement this trait, +/// since implementations are already provided for `&str` along with other +/// variants of string types, as well as `FnMut(&Captures) -> String` (or any +/// `FnMut(&Captures) -> T` where `T: AsRef<str>`). Those cover most use cases, +/// but callers can implement this trait directly if necessary. +/// +/// # Example +/// +/// This example shows a basic implementation of the `Replacer` trait. This +/// can be done much more simply using the replacement string interpolation +/// support (e.g., `$first $last`), but this approach avoids needing to parse +/// the replacement string at all. 
+/// +/// ``` +/// use regex::{Captures, Regex, Replacer}; +/// +/// struct NameSwapper; +/// +/// impl Replacer for NameSwapper { +/// fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { +/// dst.push_str(&caps["first"]); +/// dst.push_str(" "); +/// dst.push_str(&caps["last"]); +/// } +/// } +/// +/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); +/// let result = re.replace("Springsteen, Bruce", NameSwapper); +/// assert_eq!(result, "Bruce Springsteen"); +/// ``` +pub trait Replacer { + /// Appends possibly empty data to `dst` to replace the current match. + /// + /// The current match is represented by `caps`, which is guaranteed to + /// have a match at capture group `0`. + /// + /// For example, a no-op replacement would be `dst.push_str(&caps[0])`. + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); + + /// Return a fixed unchanging replacement string. + /// + /// When doing replacements, if access to [`Captures`] is not needed (e.g., + /// the replacement string does not need `$` expansion), then it can be + /// beneficial to avoid finding sub-captures. + /// + /// In general, this is called once for every call to a replacement routine + /// such as [`Regex::replace_all`]. + fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { + None + } + + /// Returns a type that implements `Replacer`, but that borrows and wraps + /// this `Replacer`. + /// + /// This is useful when you want to take a generic `Replacer` (which might + /// not be cloneable) and use it without consuming it, so it can be used + /// more than once. 
+ /// + /// # Example + /// + /// ``` + /// use regex::{Regex, Replacer}; + /// + /// fn replace_all_twice<R: Replacer>( + /// re: Regex, + /// src: &str, + /// mut rep: R, + /// ) -> String { + /// let dst = re.replace_all(src, rep.by_ref()); + /// let dst = re.replace_all(&dst, rep.by_ref()); + /// dst.into_owned() + /// } + /// ``` + fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { + ReplacerRef(self) + } +} + +impl<'a> Replacer for &'a str { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + caps.expand(*self, dst); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl Replacer for String { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_str().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<'a> Replacer for &'a Cow<'a, str> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.as_ref().replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + no_expansion(self) + } +} + +impl<F, T> Replacer for F +where + F: FnMut(&Captures<'_>) -> T, + T: AsRef<str>, +{ + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + dst.push_str((*self)(caps).as_ref()); + } +} + +/// A by-reference adaptor for a [`Replacer`]. +/// +/// This permits reusing the same `Replacer` value in multiple calls to a +/// replacement routine like [`Regex::replace_all`]. 
+/// +/// This type is created by [`Replacer::by_ref`]. +#[derive(Debug)] +pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); + +impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { + fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { + self.0.replace_append(caps, dst) + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + self.0.no_expansion() + } +} + +/// A helper type for forcing literal string replacement. +/// +/// It can be used with routines like [`Regex::replace`] and +/// [`Regex::replace_all`] to do a literal string replacement without expanding +/// `$name` to their corresponding capture groups. This can be both convenient +/// (to avoid escaping `$`, for example) and faster (since capture groups +/// don't need to be found). +/// +/// `'s` is the lifetime of the literal string to use. +/// +/// # Example +/// +/// ``` +/// use regex::{NoExpand, Regex}; +/// +/// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); +/// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); +/// assert_eq!(result, "$2 $last"); +/// ``` +#[derive(Clone, Debug)] +pub struct NoExpand<'s>(pub &'s str); + +impl<'s> Replacer for NoExpand<'s> { + fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { + dst.push_str(self.0); + } + + fn no_expansion(&mut self) -> Option<Cow<'_, str>> { + Some(Cow::Borrowed(self.0)) + } +} + +/// Quickly checks the given replacement string for whether interpolation +/// should be done on it. It returns `None` if a `$` was found anywhere in the +/// given string, which suggests interpolation needs to be done. But if there's +/// no `$` anywhere, then interpolation definitely does not need to be done. In +/// that case, the given string is returned as a borrowed `Cow`. +/// +/// This is meant to be used to implement the `Replacer::no_expandsion` method +/// in its various trait impls. 
+fn no_expansion<T: AsRef<str>>(replacement: &T) -> Option<Cow<'_, str>> { + let replacement = replacement.as_ref(); + match crate::find_byte::find_byte(b'$', replacement.as_bytes()) { + Some(_) => None, + None => Some(Cow::Borrowed(replacement)), + } +} diff --git a/src/regexset/bytes.rs b/src/regexset/bytes.rs new file mode 100644 index 0000000000..1220a14662 --- /dev/null +++ b/src/regexset/bytes.rs @@ -0,0 +1,710 @@ +use alloc::string::String; + +use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter}; + +use crate::{bytes::RegexSetBuilder, Error}; + +/// Match multiple, possibly overlapping, regexes in a single search. +/// +/// A regex set corresponds to the union of zero or more regular expressions. +/// That is, a regex set will match a haystack when at least one of its +/// constituent regexes matches. A regex set as its formulated here provides a +/// touch more power: it will also report *which* regular expressions in the +/// set match. Indeed, this is the key difference between regex sets and a +/// single `Regex` with many alternates, since only one alternate can match at +/// a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a +/// regex set is constructed from those regexes, then searching the haystack +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the haystack. The key advantage of using a regex set is +/// that it will report the matching regexes using a *single pass through the +/// haystack*. If one has hundreds or thousands of regexes to match repeatedly +/// (like a URL router for a complex web application or a user agent matcher), +/// then a regex set *can* realize huge performance gains. 
+///
+/// Unlike the top-level [`RegexSet`](crate::RegexSet), this `RegexSet`
+/// searches haystacks with type `&[u8]` instead of `&str`. Consequently, this
+/// `RegexSet` is permitted to match invalid UTF-8.
+///
+/// # Limitations
+///
+/// Regex sets are limited to answering the following two questions:
+///
+/// 1. Does any regex in the set match?
+/// 2. If so, which regexes in the set match?
+///
+/// As with the main [`Regex`][crate::bytes::Regex] type, it is cheaper to ask
+/// (1) instead of (2) since the matching engines can stop after the first
+/// match is found.
+///
+/// You cannot directly extract [`Match`][crate::bytes::Match] or
+/// [`Captures`][crate::bytes::Captures] objects from a regex set. If you need
+/// these operations, the recommended approach is to compile each pattern in
+/// the set independently and scan the exact same haystack a second time with
+/// those independently compiled patterns:
+///
+/// ```
+/// use regex::bytes::{Regex, RegexSet};
+///
+/// let patterns = ["foo", "bar"];
+/// // Both patterns will match different ranges of this string.
+/// let hay = b"barfoo";
+///
+/// // Compile a set matching any of our patterns.
+/// let set = RegexSet::new(patterns).unwrap();
+/// // Compile each pattern independently.
+/// let regexes: Vec<_> = set
+///     .patterns()
+///     .iter()
+///     .map(|pat| Regex::new(pat).unwrap())
+///     .collect();
+///
+/// // Match against the whole set first and identify the individual
+/// // matching patterns.
+/// let matches: Vec<&[u8]> = set
+///     .matches(hay)
+///     .into_iter()
+///     // Dereference the match index to get the corresponding
+///     // compiled pattern.
+///     .map(|index| &regexes[index])
+///     // To get match locations or any other info, we then have to search the
+///     // exact same haystack again, using our separately-compiled pattern.
+/// .map(|re| re.find(hay).unwrap().as_bytes()) +/// .collect(); +/// +/// // Matches arrive in the order the constituent patterns were declared, +/// // not the order they appear in the haystack. +/// assert_eq!(vec![&b"foo"[..], &b"bar"[..]], matches); +/// ``` +/// +/// # Performance +/// +/// A `RegexSet` has the same performance characteristics as `Regex`. Namely, +/// search takes `O(m * n)` time, where `m` is proportional to the size of the +/// regex set and `n` is proportional to the length of the haystack. +/// +/// # Trait implementations +/// +/// The `Default` trait is implemented for `RegexSet`. The default value +/// is an empty set. An empty set can also be explicitly constructed via +/// [`RegexSet::empty`]. +/// +/// # Example +/// +/// This shows how the above two regexes (for matching email addresses and +/// domains) might work: +/// +/// ``` +/// use regex::bytes::RegexSet; +/// +/// let set = RegexSet::new(&[ +/// r"[a-z]+@[a-z]+\.(com|org|net)", +/// r"[a-z]+\.(com|org|net)", +/// ]).unwrap(); +/// +/// // Ask whether any regexes in the set match. +/// assert!(set.is_match(b"foo@example.com")); +/// +/// // Identify which regexes in the set match. +/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); +/// assert_eq!(vec![0, 1], matches); +/// +/// // Try again, but with a haystack that only matches one of the regexes. +/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); +/// assert_eq!(vec![1], matches); +/// +/// // Try again, but with a haystack that doesn't match any regex in the set. 
+/// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```text +/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +#[derive(Clone)] +pub struct RegexSet { + pub(crate) meta: meta::Regex, + pub(crate) patterns: alloc::sync::Arc<[String]>, +} + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match(b"foo")); + /// ``` + pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + where + S: AsRef<str>, + I: IntoIterator<Item = S>, + { + RegexSetBuilder::new(exprs).build() + } + + /// Create a new empty regex set. + /// + /// An empty regex never matches anything. + /// + /// This is a convenience function for `RegexSet::new([])`, but doesn't + /// require one to specify the type of the input. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::empty(); + /// assert!(set.is_empty()); + /// // an empty set matches nothing + /// assert!(!set.is_match(b"")); + /// ``` + pub fn empty() -> RegexSet { + let empty: [&str; 0] = []; + RegexSetBuilder::new(empty).build().unwrap() + } + + /// Returns true if and only if one of the regexes in this set matches + /// the haystack given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using [`Regex`](crate::bytes::Regex), the + /// expression is unanchored by default. That is, if the regex does not + /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted + /// to match anywhere in the haystack. + /// + /// # Example + /// + /// Tests whether a set matches somewhere in a haystack: + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match(b"foo")); + /// assert!(!set.is_match("☃".as_bytes())); + /// ``` + #[inline] + pub fn is_match(&self, haystack: &[u8]) -> bool { + self.is_match_at(haystack, 0) + } + + /// Returns true if and only if one of the regexes in this set matches the + /// haystack given, with the search starting at the offset given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start`. 
Namely, consider a + /// haystack `foobar` and a desire to execute a search starting at offset + /// `3`. You could search a substring explicitly, but then the look-around + /// assertions won't work correctly. Instead, you can use this method to + /// specify the start position of a search. + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap(); + /// let hay = b"foobar"; + /// // We get a match here, but it's probably not intended. + /// assert!(set.is_match(&hay[3..])); + /// // No match because the assertions take the context into account. + /// assert!(!set.is_match_at(hay, 3)); + /// ``` + #[inline] + pub fn is_match_at(&self, haystack: &[u8], start: usize) -> bool { + self.meta.is_match(Input::new(haystack).span(start..haystack.len())) + } + + /// Returns the set of regexes that match in the given haystack. + /// + /// The set returned contains the index of each regex that matches in + /// the given haystack. The index is in correspondence with the order of + /// regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. The order + /// of iteration is always ascending with respect to the matching indices. + /// + /// Note that as with searches using [`Regex`](crate::bytes::Regex), the + /// expression is unanchored by default. That is, if the regex does not + /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted + /// to match anywhere in the haystack. 
+ /// + /// # Example + /// + /// Tests which regular expressions match the given haystack: + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches(b"foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches(b"foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + #[inline] + pub fn matches(&self, haystack: &[u8]) -> SetMatches { + self.matches_at(haystack, 0) + } + + /// Returns the set of regexes that match in the given haystack. + /// + /// The set returned contains the index of each regex that matches in + /// the given haystack. The index is in correspondence with the order of + /// regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. The order + /// of iteration is always ascending with respect to the matching indices. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// Tests which regular expressions match the given haystack: + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap(); + /// let hay = b"foobar"; + /// // We get matches here, but it's probably not intended. + /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect(); + /// assert_eq!(matches, vec![0, 1]); + /// // No matches because the assertions take the context into account. 
+    /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
+    /// assert_eq!(matches, vec![]);
+    /// ```
+    #[inline]
+    pub fn matches_at(&self, haystack: &[u8], start: usize) -> SetMatches {
+        let input = Input::new(haystack).span(start..haystack.len());
+        let mut patset = PatternSet::new(self.meta.pattern_len());
+        self.meta.which_overlapping_matches(&input, &mut patset);
+        SetMatches(patset)
+    }
+
+    /// Returns the same as matches, but starts the search at the given
+    /// offset and stores the matches into the slice given.
+    ///
+    /// The significance of the starting point is that it takes the surrounding
+    /// context into consideration. For example, the `\A` anchor can only
+    /// match when `start == 0`.
+    ///
+    /// `matches` must have a length that is at least the number of regexes
+    /// in this set.
+    ///
+    /// This method returns true if and only if at least one member of
+    /// `matches` is true after executing the set against `haystack`.
+    #[doc(hidden)]
+    #[inline]
+    pub fn matches_read_at(
+        &self,
+        matches: &mut [bool],
+        haystack: &[u8],
+        start: usize,
+    ) -> bool {
+        // This is pretty dumb. We should try to fix this, but the
+        // regex-automata API doesn't provide a way to store matches in an
+        // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and
+        // thus not public... But regex-capi currently uses it. We should
+        // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet
+        // is in regex-automata, not regex. So maybe we should just accept a
+        // 'SetMatches', which is basically just a newtype around PatternSet.
+        let mut patset = PatternSet::new(self.meta.pattern_len());
+        let mut input = Input::new(haystack);
+        input.set_start(start);
+        self.meta.which_overlapping_matches(&input, &mut patset);
+        for pid in patset.iter() {
+            matches[pid] = true;
+        }
+        !patset.is_empty()
+    }
+
+    /// An alias for `matches_read_at` to preserve backward compatibility.
+    ///
+    /// The `regex-capi` crate used this method, so to avoid breaking that
+    /// crate, we continue to export it as an undocumented API.
+    #[doc(hidden)]
+    #[inline]
+    pub fn read_matches_at(
+        &self,
+        matches: &mut [bool],
+        haystack: &[u8],
+        start: usize,
+    ) -> bool {
+        self.matches_read_at(matches, haystack, start)
+    }
+
+    /// Returns the total number of regexes in this set.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex::bytes::RegexSet;
+    ///
+    /// assert_eq!(0, RegexSet::empty().len());
+    /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
+    /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
+    /// ```
+    #[inline]
+    pub fn len(&self) -> usize {
+        self.meta.pattern_len()
+    }
+
+    /// Returns `true` if this set contains no regexes.
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex::bytes::RegexSet;
+    ///
+    /// assert!(RegexSet::empty().is_empty());
+    /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
+    /// ```
+    #[inline]
+    pub fn is_empty(&self) -> bool {
+        self.meta.pattern_len() == 0
+    }
+
+    /// Returns the regex patterns that this regex set was constructed from.
+    ///
+    /// This function can be used to determine the pattern for a match. The
+    /// slice returned has exactly as many patterns as were given to this
+    /// regex set, and the order of the slice is the same as the order of the
+    /// patterns provided to the set.
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set + /// .matches(b"foobar") + /// .into_iter() + /// .map(|index| &set.patterns()[index]) + /// .collect(); + /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); + /// ``` + #[inline] + pub fn patterns(&self) -> &[String] { + &self.patterns + } +} + +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + +/// A set of matches returned by a regex set. +/// +/// Values of this type are constructed by [`RegexSet::matches`]. +#[derive(Clone, Debug)] +pub struct SetMatches(PatternSet); + +impl SetMatches { + /// Whether this set contains any matches. + /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new(&[ + /// r"[a-z]+@[a-z]+\.(com|org|net)", + /// r"[a-z]+\.(com|org|net)", + /// ]).unwrap(); + /// let matches = set.matches(b"foo@example.com"); + /// assert!(matches.matched_any()); + /// ``` + #[inline] + pub fn matched_any(&self) -> bool { + !self.0.is_empty() + } + + /// Whether the regex at the given index matched. + /// + /// The index for a regex is determined by its insertion order upon the + /// initial construction of a `RegexSet`, starting at `0`. + /// + /// # Panics + /// + /// If `index` is greater than or equal to the number of regexes in the + /// original set that produced these matches. Equivalently, when `index` + /// is greater than or equal to [`SetMatches::len`]. 
+ /// + /// # Example + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([ + /// r"[a-z]+@[a-z]+\.(com|org|net)", + /// r"[a-z]+\.(com|org|net)", + /// ]).unwrap(); + /// let matches = set.matches(b"example.com"); + /// assert!(!matches.matched(0)); + /// assert!(matches.matched(1)); + /// ``` + #[inline] + pub fn matched(&self, index: usize) -> bool { + self.0.contains(PatternID::new_unchecked(index)) + } + + /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. + /// + /// # Example + /// + /// Notice that this method returns the total number of regexes in the + /// original set, and *not* the total number of regexes that matched. + /// + /// ``` + /// use regex::bytes::RegexSet; + /// + /// let set = RegexSet::new([ + /// r"[a-z]+@[a-z]+\.(com|org|net)", + /// r"[a-z]+\.(com|org|net)", + /// ]).unwrap(); + /// let matches = set.matches(b"example.com"); + /// // Total number of patterns that matched. + /// assert_eq!(1, matches.iter().count()); + /// // Total number of patterns in the set. + /// assert_eq!(2, matches.len()); + /// ``` + #[inline] + pub fn len(&self) -> usize { + self.0.capacity() + } + + /// Returns an iterator over the indices of the regexes that matched. + /// + /// This will always produces matches in ascending order, where the index + /// yielded corresponds to the index of the regex that matched with respect + /// to its position when initially building the set. 
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// use regex::bytes::RegexSet;
+    ///
+    /// let set = RegexSet::new([
+    ///     r"[0-9]",
+    ///     r"[a-z]",
+    ///     r"[A-Z]",
+    ///     r"\p{Greek}",
+    /// ]).unwrap();
+    /// let hay = "βa1".as_bytes();
+    /// let matches: Vec<_> = set.matches(hay).iter().collect();
+    /// assert_eq!(matches, vec![0, 1, 3]);
+    /// ```
+    ///
+    /// Note that `SetMatches` also implements the `IntoIterator` trait, so
+    /// this method is not always needed. For example:
+    ///
+    /// ```
+    /// use regex::bytes::RegexSet;
+    ///
+    /// let set = RegexSet::new([
+    ///     r"[0-9]",
+    ///     r"[a-z]",
+    ///     r"[A-Z]",
+    ///     r"\p{Greek}",
+    /// ]).unwrap();
+    /// let hay = "βa1".as_bytes();
+    /// let mut matches = vec![];
+    /// for index in set.matches(hay) {
+    ///     matches.push(index);
+    /// }
+    /// assert_eq!(matches, vec![0, 1, 3]);
+    /// ```
+    #[inline]
+    pub fn iter(&self) -> SetMatchesIter<'_> {
+        SetMatchesIter(self.0.iter())
+    }
+}
+
+impl IntoIterator for SetMatches {
+    type IntoIter = SetMatchesIntoIter;
+    type Item = usize;
+
+    fn into_iter(self) -> Self::IntoIter {
+        let it = 0..self.0.capacity();
+        SetMatchesIntoIter { patset: self.0, it }
+    }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+    type IntoIter = SetMatchesIter<'a>;
+    type Item = usize;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.iter()
+    }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+///
+/// This will always produce matches in ascending order of index, where the
+/// index corresponds to the index of the regex that matched with respect to
+/// its position when initially building the set.
+///
+/// This iterator is created by calling `SetMatches::into_iter` via the
+/// `IntoIterator` trait. This is automatically done in `for` loops.
+/// +/// # Example +/// +/// ``` +/// use regex::bytes::RegexSet; +/// +/// let set = RegexSet::new([ +/// r"[0-9]", +/// r"[a-z]", +/// r"[A-Z]", +/// r"\p{Greek}", +/// ]).unwrap(); +/// let hay = "βa1".as_bytes(); +/// let mut matches = vec![]; +/// for index in set.matches(hay) { +/// matches.push(index); +/// } +/// assert_eq!(matches, vec![0, 1, 3]); +/// ``` +#[derive(Debug)] +pub struct SetMatchesIntoIter { + patset: PatternSet, + it: core::ops::Range<usize>, +} + +impl Iterator for SetMatchesIntoIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + let id = self.it.next()?; + if self.patset.contains(PatternID::new_unchecked(id)) { + return Some(id); + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } +} + +impl DoubleEndedIterator for SetMatchesIntoIter { + fn next_back(&mut self) -> Option<usize> { + loop { + let id = self.it.next_back()?; + if self.patset.contains(PatternID::new_unchecked(id)) { + return Some(id); + } + } + } +} + +impl core::iter::FusedIterator for SetMatchesIntoIter {} + +/// A borrowed iterator over the set of matches from a regex set. +/// +/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that +/// created this iterator. +/// +/// This will always produces matches in ascending order, where the index +/// corresponds to the index of the regex that matched with respect to its +/// position when initially building the set. +/// +/// This iterator is created by the [`SetMatches::iter`] method. 
+#[derive(Clone, Debug)] +pub struct SetMatchesIter<'a>(PatternSetIter<'a>); + +impl<'a> Iterator for SetMatchesIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + self.0.next().map(|pid| pid.as_usize()) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { + fn next_back(&mut self) -> Option<usize> { + self.0.next_back().map(|pid| pid.as_usize()) + } +} + +impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {} + +impl core::fmt::Debug for RegexSet { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "RegexSet({:?})", self.patterns()) + } +} diff --git a/src/regexset/mod.rs b/src/regexset/mod.rs new file mode 100644 index 0000000000..93fadec8bf --- /dev/null +++ b/src/regexset/mod.rs @@ -0,0 +1,2 @@ +pub(crate) mod bytes; +pub(crate) mod string; diff --git a/src/regexset/string.rs b/src/regexset/string.rs new file mode 100644 index 0000000000..2a3e7b8027 --- /dev/null +++ b/src/regexset/string.rs @@ -0,0 +1,706 @@ +use alloc::string::String; + +use regex_automata::{meta, Input, PatternID, PatternSet, PatternSetIter}; + +use crate::{Error, RegexSetBuilder}; + +/// Match multiple, possibly overlapping, regexes in a single search. +/// +/// A regex set corresponds to the union of zero or more regular expressions. +/// That is, a regex set will match a haystack when at least one of its +/// constituent regexes matches. A regex set as its formulated here provides a +/// touch more power: it will also report *which* regular expressions in the +/// set match. Indeed, this is the key difference between regex sets and a +/// single `Regex` with many alternates, since only one alternate can match at +/// a time. +/// +/// For example, consider regular expressions to match email addresses and +/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. 
If a +/// regex set is constructed from those regexes, then searching the haystack +/// `foo@example.com` will report both regexes as matching. Of course, one +/// could accomplish this by compiling each regex on its own and doing two +/// searches over the haystack. The key advantage of using a regex set is +/// that it will report the matching regexes using a *single pass through the +/// haystack*. If one has hundreds or thousands of regexes to match repeatedly +/// (like a URL router for a complex web application or a user agent matcher), +/// then a regex set *can* realize huge performance gains. +/// +/// # Limitations +/// +/// Regex sets are limited to answering the following two questions: +/// +/// 1. Does any regex in the set match? +/// 2. If so, which regexes in the set match? +/// +/// As with the main [`Regex`][crate::Regex] type, it is cheaper to ask (1) +/// instead of (2) since the matching engines can stop after the first match +/// is found. +/// +/// You cannot directly extract [`Match`][crate::Match] or +/// [`Captures`][crate::Captures] objects from a regex set. If you need these +/// operations, the recommended approach is to compile each pattern in the set +/// independently and scan the exact same haystack a second time with those +/// independently compiled patterns: +/// +/// ``` +/// use regex::{Regex, RegexSet}; +/// +/// let patterns = ["foo", "bar"]; +/// // Both patterns will match different ranges of this string. +/// let hay = "barfoo"; +/// +/// // Compile a set matching any of our patterns. +/// let set = RegexSet::new(patterns).unwrap(); +/// // Compile each pattern independently. +/// let regexes: Vec<_> = set +/// .patterns() +/// .iter() +/// .map(|pat| Regex::new(pat).unwrap()) +/// .collect(); +/// +/// // Match against the whole set first and identify the individual +/// // matching patterns. 
+/// let matches: Vec<&str> = set
+/// .matches(hay)
+/// .into_iter()
+/// // Dereference the match index to get the corresponding
+/// // compiled pattern.
+/// .map(|index| &regexes[index])
+/// // To get match locations or any other info, we then have to search the
+/// // exact same haystack again, using our separately-compiled pattern.
+/// .map(|re| re.find(hay).unwrap().as_str())
+/// .collect();
+///
+/// // Matches arrive in the order the constituent patterns were declared,
+/// // not the order they appear in the haystack.
+/// assert_eq!(vec!["foo", "bar"], matches);
+/// ```
+///
+/// # Performance
+///
+/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
+/// search takes `O(m * n)` time, where `m` is proportional to the size of the
+/// regex set and `n` is proportional to the length of the haystack.
+///
+/// # Trait implementations
+///
+/// The `Default` trait is implemented for `RegexSet`. The default value
+/// is an empty set. An empty set can also be explicitly constructed via
+/// [`RegexSet::empty`].
+///
+/// # Example
+///
+/// This shows how the above two regexes (for matching email addresses and
+/// domains) might work:
+///
+/// ```
+/// use regex::RegexSet;
+///
+/// let set = RegexSet::new(&[
+/// r"[a-z]+@[a-z]+\.(com|org|net)",
+/// r"[a-z]+\.(com|org|net)",
+/// ]).unwrap();
+///
+/// // Ask whether any regexes in the set match.
+/// assert!(set.is_match("foo@example.com"));
+///
+/// // Identify which regexes in the set match.
+/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
+/// assert_eq!(vec![0, 1], matches);
+///
+/// // Try again, but with a haystack that only matches one of the regexes.
+/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
+/// assert_eq!(vec![1], matches);
+///
+/// // Try again, but with a haystack that doesn't match any regex in the set.
+/// let matches: Vec<_> = set.matches("example").into_iter().collect(); +/// assert!(matches.is_empty()); +/// ``` +/// +/// Note that it would be possible to adapt the above example to using `Regex` +/// with an expression like: +/// +/// ```text +/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) +/// ``` +/// +/// After a match, one could then inspect the capture groups to figure out +/// which alternates matched. The problem is that it is hard to make this +/// approach scale when there are many regexes since the overlap between each +/// alternate isn't always obvious to reason about. +#[derive(Clone)] +pub struct RegexSet { + pub(crate) meta: meta::Regex, + pub(crate) patterns: alloc::sync::Arc<[String]>, +} + +impl RegexSet { + /// Create a new regex set with the given regular expressions. + /// + /// This takes an iterator of `S`, where `S` is something that can produce + /// a `&str`. If any of the strings in the iterator are not valid regular + /// expressions, then an error is returned. + /// + /// # Example + /// + /// Create a new regex set from an iterator of strings: + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// ``` + pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> + where + S: AsRef<str>, + I: IntoIterator<Item = S>, + { + RegexSetBuilder::new(exprs).build() + } + + /// Create a new empty regex set. + /// + /// An empty regex never matches anything. + /// + /// This is a convenience function for `RegexSet::new([])`, but doesn't + /// require one to specify the type of the input. 
+ /// + /// # Example + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::empty(); + /// assert!(set.is_empty()); + /// // an empty set matches nothing + /// assert!(!set.is_match("")); + /// ``` + pub fn empty() -> RegexSet { + let empty: [&str; 0] = []; + RegexSetBuilder::new(empty).build().unwrap() + } + + /// Returns true if and only if one of the regexes in this set matches + /// the haystack given. + /// + /// This method should be preferred if you only need to test whether any + /// of the regexes in the set should match, but don't care about *which* + /// regexes matched. This is because the underlying matching engine will + /// quit immediately after seeing the first match instead of continuing to + /// find all matches. + /// + /// Note that as with searches using [`Regex`](crate::Regex), the + /// expression is unanchored by default. That is, if the regex does not + /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted + /// to match anywhere in the haystack. + /// + /// # Example + /// + /// Tests whether a set matches somewhere in a haystack: + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([r"\w+", r"\d+"]).unwrap(); + /// assert!(set.is_match("foo")); + /// assert!(!set.is_match("☃")); + /// ``` + #[inline] + pub fn is_match(&self, haystack: &str) -> bool { + self.is_match_at(haystack, 0) + } + + /// Returns true if and only if one of the regexes in this set matches the + /// haystack given, with the search starting at the offset given. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// This example shows the significance of `start`. 
Namely, consider a + /// haystack `foobar` and a desire to execute a search starting at offset + /// `3`. You could search a substring explicitly, but then the look-around + /// assertions won't work correctly. Instead, you can use this method to + /// specify the start position of a search. + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap(); + /// let hay = "foobar"; + /// // We get a match here, but it's probably not intended. + /// assert!(set.is_match(&hay[3..])); + /// // No match because the assertions take the context into account. + /// assert!(!set.is_match_at(hay, 3)); + /// ``` + #[inline] + pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { + self.meta.is_match(Input::new(haystack).span(start..haystack.len())) + } + + /// Returns the set of regexes that match in the given haystack. + /// + /// The set returned contains the index of each regex that matches in + /// the given haystack. The index is in correspondence with the order of + /// regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. The order + /// of iteration is always ascending with respect to the matching indices. + /// + /// Note that as with searches using [`Regex`](crate::Regex), the + /// expression is unanchored by default. That is, if the regex does not + /// start with `^` or `\A`, or end with `$` or `\z`, then it is permitted + /// to match anywhere in the haystack. 
+ /// + /// # Example + /// + /// Tests which regular expressions match the given haystack: + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); + /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); + /// + /// // You can also test whether a particular regex matched: + /// let matches = set.matches("foobar"); + /// assert!(!matches.matched(5)); + /// assert!(matches.matched(6)); + /// ``` + #[inline] + pub fn matches(&self, haystack: &str) -> SetMatches { + self.matches_at(haystack, 0) + } + + /// Returns the set of regexes that match in the given haystack. + /// + /// The set returned contains the index of each regex that matches in + /// the given haystack. The index is in correspondence with the order of + /// regular expressions given to `RegexSet`'s constructor. + /// + /// The set can also be used to iterate over the matched indices. The order + /// of iteration is always ascending with respect to the matching indices. + /// + /// The significance of the starting point is that it takes the surrounding + /// context into consideration. For example, the `\A` anchor can only + /// match when `start == 0`. + /// + /// # Panics + /// + /// This panics when `start >= haystack.len() + 1`. + /// + /// # Example + /// + /// Tests which regular expressions match the given haystack: + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([r"\bbar\b", r"(?m)^bar$"]).unwrap(); + /// let hay = "foobar"; + /// // We get matches here, but it's probably not intended. + /// let matches: Vec<_> = set.matches(&hay[3..]).into_iter().collect(); + /// assert_eq!(matches, vec![0, 1]); + /// // No matches because the assertions take the context into account. 
+ /// let matches: Vec<_> = set.matches_at(hay, 3).into_iter().collect();
+ /// assert_eq!(matches, vec![]);
+ /// ```
+ #[inline]
+ pub fn matches_at(&self, haystack: &str, start: usize) -> SetMatches {
+ let input = Input::new(haystack).span(start..haystack.len());
+ let mut patset = PatternSet::new(self.meta.pattern_len());
+ self.meta.which_overlapping_matches(&input, &mut patset);
+ SetMatches(patset)
+ }
+
+ /// Returns the same as matches, but starts the search at the given
+ /// offset and stores the matches into the slice given.
+ ///
+ /// The significance of the starting point is that it takes the surrounding
+ /// context into consideration. For example, the `\A` anchor can only
+ /// match when `start == 0`.
+ ///
+ /// `matches` must have a length that is at least the number of regexes
+ /// in this set.
+ ///
+ /// This method returns true if and only if at least one member of
+ /// `matches` is true after executing the set against `haystack`.
+ #[doc(hidden)]
+ #[inline]
+ pub fn matches_read_at(
+ &self,
+ matches: &mut [bool],
+ haystack: &str,
+ start: usize,
+ ) -> bool {
+ // This is pretty dumb. We should try to fix this, but the
+ // regex-automata API doesn't provide a way to store matches in an
+ // arbitrary &mut [bool]. Thankfully, this API is doc(hidden) and
+ // thus not public... But regex-capi currently uses it. We should
+ // fix regex-capi to use a PatternSet, maybe? Not sure... PatternSet
+ // is in regex-automata, not regex. So maybe we should just accept a
+ // 'SetMatches', which is basically just a newtype around PatternSet.
+ let mut patset = PatternSet::new(self.meta.pattern_len());
+ let mut input = Input::new(haystack);
+ input.set_start(start);
+ self.meta.which_overlapping_matches(&input, &mut patset);
+ for pid in patset.iter() {
+ matches[pid] = true;
+ }
+ !patset.is_empty()
+ }
+
+ /// An alias for `matches_read_at` to preserve backward compatibility.
+ ///
+ /// The `regex-capi` crate used this method, so to avoid breaking that
+ /// crate, we continue to export it as an undocumented API.
+ #[doc(hidden)]
+ #[inline]
+ pub fn read_matches_at(
+ &self,
+ matches: &mut [bool],
+ haystack: &str,
+ start: usize,
+ ) -> bool {
+ self.matches_read_at(matches, haystack, start)
+ }
+
+ /// Returns the total number of regexes in this set.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// assert_eq!(0, RegexSet::empty().len());
+ /// assert_eq!(1, RegexSet::new([r"[0-9]"]).unwrap().len());
+ /// assert_eq!(2, RegexSet::new([r"[0-9]", r"[a-z]"]).unwrap().len());
+ /// ```
+ #[inline]
+ pub fn len(&self) -> usize {
+ self.meta.pattern_len()
+ }
+
+ /// Returns `true` if this set contains no regexes.
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// assert!(RegexSet::empty().is_empty());
+ /// assert!(!RegexSet::new([r"[0-9]"]).unwrap().is_empty());
+ /// ```
+ #[inline]
+ pub fn is_empty(&self) -> bool {
+ self.meta.pattern_len() == 0
+ }
+
+ /// Returns the regex patterns that this regex set was constructed from.
+ ///
+ /// This function can be used to determine the pattern for a match. The
+ /// slice returned has exactly as many patterns as given to this regex set,
+ /// and the order of the slice is the same as the order of the patterns
+ /// provided to the set.
+ /// + /// # Example + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new(&[ + /// r"\w+", + /// r"\d+", + /// r"\pL+", + /// r"foo", + /// r"bar", + /// r"barfoo", + /// r"foobar", + /// ]).unwrap(); + /// let matches: Vec<_> = set + /// .matches("foobar") + /// .into_iter() + /// .map(|index| &set.patterns()[index]) + /// .collect(); + /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); + /// ``` + #[inline] + pub fn patterns(&self) -> &[String] { + &self.patterns + } +} + +impl Default for RegexSet { + fn default() -> Self { + RegexSet::empty() + } +} + +/// A set of matches returned by a regex set. +/// +/// Values of this type are constructed by [`RegexSet::matches`]. +#[derive(Clone, Debug)] +pub struct SetMatches(PatternSet); + +impl SetMatches { + /// Whether this set contains any matches. + /// + /// # Example + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new(&[ + /// r"[a-z]+@[a-z]+\.(com|org|net)", + /// r"[a-z]+\.(com|org|net)", + /// ]).unwrap(); + /// let matches = set.matches("foo@example.com"); + /// assert!(matches.matched_any()); + /// ``` + #[inline] + pub fn matched_any(&self) -> bool { + !self.0.is_empty() + } + + /// Whether the regex at the given index matched. + /// + /// The index for a regex is determined by its insertion order upon the + /// initial construction of a `RegexSet`, starting at `0`. + /// + /// # Panics + /// + /// If `index` is greater than or equal to the number of regexes in the + /// original set that produced these matches. Equivalently, when `index` + /// is greater than or equal to [`SetMatches::len`]. 
+ /// + /// # Example + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([ + /// r"[a-z]+@[a-z]+\.(com|org|net)", + /// r"[a-z]+\.(com|org|net)", + /// ]).unwrap(); + /// let matches = set.matches("example.com"); + /// assert!(!matches.matched(0)); + /// assert!(matches.matched(1)); + /// ``` + #[inline] + pub fn matched(&self, index: usize) -> bool { + self.0.contains(PatternID::new_unchecked(index)) + } + + /// The total number of regexes in the set that created these matches. + /// + /// **WARNING:** This always returns the same value as [`RegexSet::len`]. + /// In particular, it does *not* return the number of elements yielded by + /// [`SetMatches::iter`]. The only way to determine the total number of + /// matched regexes is to iterate over them. + /// + /// # Example + /// + /// Notice that this method returns the total number of regexes in the + /// original set, and *not* the total number of regexes that matched. + /// + /// ``` + /// use regex::RegexSet; + /// + /// let set = RegexSet::new([ + /// r"[a-z]+@[a-z]+\.(com|org|net)", + /// r"[a-z]+\.(com|org|net)", + /// ]).unwrap(); + /// let matches = set.matches("example.com"); + /// // Total number of patterns that matched. + /// assert_eq!(1, matches.iter().count()); + /// // Total number of patterns in the set. + /// assert_eq!(2, matches.len()); + /// ``` + #[inline] + pub fn len(&self) -> usize { + self.0.capacity() + } + + /// Returns an iterator over the indices of the regexes that matched. + /// + /// This will always produces matches in ascending order, where the index + /// yielded corresponds to the index of the regex that matched with respect + /// to its position when initially building the set. 
+ ///
+ /// # Example
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[0-9]",
+ /// r"[a-z]",
+ /// r"[A-Z]",
+ /// r"\p{Greek}",
+ /// ]).unwrap();
+ /// let hay = "βa1";
+ /// let matches: Vec<_> = set.matches(hay).iter().collect();
+ /// assert_eq!(matches, vec![0, 1, 3]);
+ /// ```
+ ///
+ /// Note that `SetMatches` also implements the `IntoIterator` trait, so
+ /// this method is not always needed. For example:
+ ///
+ /// ```
+ /// use regex::RegexSet;
+ ///
+ /// let set = RegexSet::new([
+ /// r"[0-9]",
+ /// r"[a-z]",
+ /// r"[A-Z]",
+ /// r"\p{Greek}",
+ /// ]).unwrap();
+ /// let hay = "βa1";
+ /// let mut matches = vec![];
+ /// for index in set.matches(hay) {
+ /// matches.push(index);
+ /// }
+ /// assert_eq!(matches, vec![0, 1, 3]);
+ /// ```
+ #[inline]
+ pub fn iter(&self) -> SetMatchesIter<'_> {
+ SetMatchesIter(self.0.iter())
+ }
+}
+
+impl IntoIterator for SetMatches {
+ type IntoIter = SetMatchesIntoIter;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ let it = 0..self.0.capacity();
+ SetMatchesIntoIter { patset: self.0, it }
+ }
+}
+
+impl<'a> IntoIterator for &'a SetMatches {
+ type IntoIter = SetMatchesIter<'a>;
+ type Item = usize;
+
+ fn into_iter(self) -> Self::IntoIter {
+ self.iter()
+ }
+}
+
+/// An owned iterator over the set of matches from a regex set.
+///
+/// This will always produce matches in ascending order of index, where the
+/// index corresponds to the index of the regex that matched with respect to
+/// its position when initially building the set.
+///
+/// This iterator is created by calling `SetMatches::into_iter` via the
+/// `IntoIterator` trait. This is automatically done in `for` loops.
+/// +/// # Example +/// +/// ``` +/// use regex::RegexSet; +/// +/// let set = RegexSet::new([ +/// r"[0-9]", +/// r"[a-z]", +/// r"[A-Z]", +/// r"\p{Greek}", +/// ]).unwrap(); +/// let hay = "βa1"; +/// let mut matches = vec![]; +/// for index in set.matches(hay) { +/// matches.push(index); +/// } +/// assert_eq!(matches, vec![0, 1, 3]); +/// ``` +#[derive(Debug)] +pub struct SetMatchesIntoIter { + patset: PatternSet, + it: core::ops::Range<usize>, +} + +impl Iterator for SetMatchesIntoIter { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + loop { + let id = self.it.next()?; + if self.patset.contains(PatternID::new_unchecked(id)) { + return Some(id); + } + } + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.it.size_hint() + } +} + +impl DoubleEndedIterator for SetMatchesIntoIter { + fn next_back(&mut self) -> Option<usize> { + loop { + let id = self.it.next_back()?; + if self.patset.contains(PatternID::new_unchecked(id)) { + return Some(id); + } + } + } +} + +impl core::iter::FusedIterator for SetMatchesIntoIter {} + +/// A borrowed iterator over the set of matches from a regex set. +/// +/// The lifetime `'a` refers to the lifetime of the [`SetMatches`] value that +/// created this iterator. +/// +/// This will always produces matches in ascending order, where the index +/// corresponds to the index of the regex that matched with respect to its +/// position when initially building the set. +/// +/// This iterator is created by the [`SetMatches::iter`] method. 
+#[derive(Clone, Debug)] +pub struct SetMatchesIter<'a>(PatternSetIter<'a>); + +impl<'a> Iterator for SetMatchesIter<'a> { + type Item = usize; + + fn next(&mut self) -> Option<usize> { + self.0.next().map(|pid| pid.as_usize()) + } + + fn size_hint(&self) -> (usize, Option<usize>) { + self.0.size_hint() + } +} + +impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { + fn next_back(&mut self) -> Option<usize> { + self.0.next_back().map(|pid| pid.as_usize()) + } +} + +impl<'a> core::iter::FusedIterator for SetMatchesIter<'a> {} + +impl core::fmt::Debug for RegexSet { + fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { + write!(f, "RegexSet({:?})", self.patterns()) + } +} diff --git a/test b/test index edeaf4a42d..48224c6d11 100755 --- a/test +++ b/test @@ -13,6 +13,11 @@ cd "$(dirname "$0")" echo "===== DEFAULT FEATURES =====" cargo test +# no-std mode is annoyingly difficult to test. Currently, the integration tests +# don't run. So for now, we just test that library tests run. (There aren't +# many because `regex` is just a wrapper crate.) +cargo test --no-default-features --lib + echo "===== DOC TESTS =====" cargo test --doc @@ -25,6 +30,9 @@ features=( "std perf-dfa" "std perf-inline" "std perf-literal" + "std perf-dfa-full" + "std perf-onepass" + "std perf-backtrack" ) for f in "${features[@]}"; do echo "===== FEATURE: $f =====" diff --git a/testdata/crazy.toml b/testdata/crazy.toml index dad9552eee..aed46ea157 100644 --- a/testdata/crazy.toml +++ b/testdata/crazy.toml @@ -1,5 +1,3 @@ -# TODO: There are still a couple of manually written tests in crazy.rs. - [[test]] name = "nothing-empty" regex = [] diff --git a/testdata/line-terminator.toml b/testdata/line-terminator.toml index 4f06a2847d..4de72de31e 100644 --- a/testdata/line-terminator.toml +++ b/testdata/line-terminator.toml @@ -7,6 +7,16 @@ matches = [[1, 4]] unescape = true line-terminator = '\x00' +# This tests that '.' 
will not match the configured line terminator, but will +# match \n. +[[test]] +name = "dot-changes-with-line-terminator" +regex = '.' +haystack = '\x00\n' +matches = [[1, 2]] +unescape = true +line-terminator = '\x00' + # This tests that when we switch the line terminator, \n is no longer # recognized as the terminator. [[test]] @@ -26,6 +36,7 @@ haystack = '\xFFabc\xFF' matches = [[1, 4]] unescape = true line-terminator = '\xFF' +utf8 = false # This tests that we can set the line terminator to a byte corresponding to a # word character, and things work as expected. diff --git a/testdata/regression.toml b/testdata/regression.toml index 28cd0f7065..bb5e4fd46f 100644 --- a/testdata/regression.toml +++ b/testdata/regression.toml @@ -728,3 +728,14 @@ name = "missed-match" regex = 'e..+e.ee>' haystack = 'Zeee.eZZZZZZZZeee>eeeeeee>' matches = [[1, 26]] + +# This test came from the 'ignore' crate and tripped a bug in how accelerated +# DFA states were handled in an overlapping search. +[[test]] +name = "regex-to-glob" +regex = ['(?-u)^path1/[^/]*$'] +haystack = "path1/foo" +matches = [[0, 9]] +utf8 = false +match-kind = "all" +search-kind = "overlapping" diff --git a/tests/lib.rs b/tests/lib.rs index 5fdb52e978..badd57455d 100644 --- a/tests/lib.rs +++ b/tests/lib.rs @@ -13,10 +13,7 @@ mod suite_string; mod suite_string_set; const BLACKLIST: &[&str] = &[ - // CRLF-aware line anchors aren't supported in regex API yet. - "crlf", - // Custom line terminators aren't supported in regex API yet. - "line-terminator", + // Nothing to blacklist yet! 
]; fn suite() -> anyhow::Result<regex_test::RegexTests> { diff --git a/tests/misc.rs b/tests/misc.rs index 9812142a7c..91e7d28980 100644 --- a/tests/misc.rs +++ b/tests/misc.rs @@ -17,7 +17,10 @@ fn unclosed_group_error() { fn regex_string() { assert_eq!(r"[a-zA-Z0-9]+", regex!(r"[a-zA-Z0-9]+").as_str()); assert_eq!(r"[a-zA-Z0-9]+", &format!("{}", regex!(r"[a-zA-Z0-9]+"))); - assert_eq!(r"[a-zA-Z0-9]+", &format!("{:?}", regex!(r"[a-zA-Z0-9]+"))); + assert_eq!( + r#"Regex("[a-zA-Z0-9]+")"#, + &format!("{:?}", regex!(r"[a-zA-Z0-9]+")) + ); } #[test] diff --git a/tests/regression.rs b/tests/regression.rs index 9a3e087403..a5867016b2 100644 --- a/tests/regression.rs +++ b/tests/regression.rs @@ -79,6 +79,7 @@ fn regression_big_regex_overflow() { assert!(Regex::new(pat).is_err()); } +// See: https://github.com/rust-lang/regex/issues/999 #[test] fn regression_complete_literals_suffix_incorrect() { let needles = vec![ @@ -89,5 +90,5 @@ fn regression_complete_literals_suffix_incorrect() { let pattern = needles.join("|"); let re = regex!(&pattern); let hay = "FUBAR"; - assert_eq!(0, re.find_iter(text!(hay)).count()); + assert_eq!(0, re.find_iter(hay).count()); } diff --git a/tests/suite_bytes.rs b/tests/suite_bytes.rs index 590f00d87e..106d998085 100644 --- a/tests/suite_bytes.rs +++ b/tests/suite_bytes.rs @@ -89,6 +89,7 @@ fn compiler( let re = RegexBuilder::new(pattern) .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) + .line_terminator(test.line_terminator()) .build()?; Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) } diff --git a/tests/suite_bytes_set.rs b/tests/suite_bytes_set.rs index f8c5199e45..899d24c17e 100644 --- a/tests/suite_bytes_set.rs +++ b/tests/suite_bytes_set.rs @@ -65,6 +65,7 @@ fn compiler( let re = RegexSetBuilder::new(test.regexes()) .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) + .line_terminator(test.line_terminator()) .build()?; Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) } 
diff --git a/tests/suite_string.rs b/tests/suite_string.rs index efc20c338c..1e5bf0bb3b 100644 --- a/tests/suite_string.rs +++ b/tests/suite_string.rs @@ -97,6 +97,7 @@ fn compiler( let re = RegexBuilder::new(pattern) .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) + .line_terminator(test.line_terminator()) .build()?; Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) } diff --git a/tests/suite_string_set.rs b/tests/suite_string_set.rs index 545f0f1cef..dffdc70810 100644 --- a/tests/suite_string_set.rs +++ b/tests/suite_string_set.rs @@ -73,6 +73,7 @@ fn compiler( let re = RegexSetBuilder::new(test.regexes()) .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) + .line_terminator(test.line_terminator()) .build()?; Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) }