Skip to content

Commit

Permalink
LUCENE-9981: more efficient getCommonSuffix/Prefix, and more accurate…
Browse files Browse the repository at this point in the history
… 'effort limit', instead of precise output state limit, during determinize, for throwing TooComplexToDeterminizeException
  • Loading branch information
mikemccand committed Jun 1, 2021
1 parent 27b009c commit c4cf7aa
Show file tree
Hide file tree
Showing 41 changed files with 612 additions and 373 deletions.
12 changes: 12 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,18 @@ Improvements
* LUCENE-9929: Add NorwegianNormalizationFilter, which does the same as ScandinavianNormalizationFilter except
it does not fold oo->ø and ao->å. (janhoy, Robert Muir, Adrien Grand)

* LUCENE-9981: Operations.getCommonSuffix/Prefix(Automaton) is now much more
efficient, from a worst case exponential down to quadratic cost in the
number of states + transitions in the Automaton. These methods no longer
use the costly determinize method, removing the risk of
TooComplexToDeterminizeException (Robert Muir, Mike McCandless)

* LUCENE-9981: Operations.determinize now throws TooComplexToDeterminizeException
based on too much "effort" spent determinizing rather than a precise state
count on the resulting returned automaton, to better handle adversarial
cases like det(rev(regexp("(.*a){2000}"))) that spend lots of effort but
result in smallish eventual returned automata. (Robert Muir, Mike McCandless)

Bug fixes

* LUCENE-9686: Fix read past EOF handling in DirectIODirectory. (Zach Chen,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class TrigramAutomaton {

automaton =
new CharacterRunAutomaton(
Operations.determinize(builder.finish(), Operations.DEFAULT_MAX_DETERMINIZED_STATES));
Operations.determinize(builder.finish(), Operations.DEFAULT_DETERMINIZE_WORK_LIMIT));

state2Score = new int[automaton.getSize()];
for (Map.Entry<String, Integer> entry : substringCounts.entrySet()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public final class ConcatenateGraphFilter extends TokenStream {
/** Represents the default separator between tokens. */
public static final int SEP_LABEL = TokenStreamToAutomaton.POS_SEP;

public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_MAX_DETERMINIZED_STATES;
public static final int DEFAULT_MAX_GRAPH_EXPANSIONS = Operations.DEFAULT_DETERMINIZE_WORK_LIMIT;
public static final Character DEFAULT_TOKEN_SEPARATOR = SEP_LABEL;
public static final boolean DEFAULT_PRESERVE_SEP = true;
public static final boolean DEFAULT_PRESERVE_POSITION_INCREMENTS = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ public final class SimplePatternSplitTokenizer extends Tokenizer {

/** See {@link RegExp} for the accepted syntax. */
public SimplePatternSplitTokenizer(String regexp) {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/** Runs a pre-built automaton. */
Expand All @@ -73,8 +73,8 @@ public SimplePatternSplitTokenizer(Automaton dfa) {

/** See {@link RegExp} for the accepted syntax. */
public SimplePatternSplitTokenizer(
AttributeFactory factory, String regexp, int maxDeterminizedStates) {
this(factory, new RegExp(regexp).toAutomaton());
AttributeFactory factory, String regexp, int determinizeWorkLimit) {
this(factory, new RegExp(regexp).toAutomaton(determinizeWorkLimit));
}

/** Runs a pre-built automaton. */
Expand All @@ -88,7 +88,7 @@ public SimplePatternSplitTokenizer(AttributeFactory factory, Automaton dfa) {
throw new IllegalArgumentException("please determinize the incoming automaton first");
}

runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

private void fillToken(int offsetStart) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,9 @@
* <ul>
* <li>"pattern" (required) is the regular expression, according to the syntax described at {@link
* RegExp}
* <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the
* determined automaton computed from the regexp
* <li>"determinizeWorkLimit" (optional, default {@link
* Operations#DEFAULT_DETERMINIZE_WORK_LIMIT}) the limit on total effort to determinize the
* automaton computed from the regexp
* </ul>
*
* <p>The pattern matches the characters that should split tokens, like {@code String.split}, and
Expand Down Expand Up @@ -64,16 +65,16 @@ public class SimplePatternSplitTokenizerFactory extends TokenizerFactory {

public static final String PATTERN = "pattern";
private final Automaton dfa;
private final int maxDeterminizedStates;
private final int determinizeWorkLimit;

/** Creates a new SimpleSplitPatternTokenizerFactory */
public SimplePatternSplitTokenizerFactory(Map<String, String> args) {
super(args);
maxDeterminizedStates =
getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
determinizeWorkLimit =
getInt(args, "determinizeWorkLimit", Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
dfa =
Operations.determinize(
new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
new RegExp(require(args, PATTERN)).toAutomaton(), determinizeWorkLimit);
if (args.isEmpty() == false) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public final class SimplePatternTokenizer extends Tokenizer {

/** See {@link RegExp} for the accepted syntax. */
public SimplePatternTokenizer(String regexp) {
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(DEFAULT_TOKEN_ATTRIBUTE_FACTORY, regexp, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/** Runs a pre-built automaton. */
Expand All @@ -83,8 +83,7 @@ public SimplePatternTokenizer(Automaton dfa) {
}

/** See {@link RegExp} for the accepted syntax. */
public SimplePatternTokenizer(
AttributeFactory factory, String regexp, int maxDeterminizedStates) {
public SimplePatternTokenizer(AttributeFactory factory, String regexp, int determinizeWorkLimit) {
this(factory, new RegExp(regexp).toAutomaton());
}

Expand All @@ -99,7 +98,7 @@ public SimplePatternTokenizer(AttributeFactory factory, Automaton dfa) {
throw new IllegalArgumentException("please determinize the incoming automaton first");
}

runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
runDFA = new CharacterRunAutomaton(dfa, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@
* <ul>
* <li>"pattern" (required) is the regular expression, according to the syntax described at {@link
* RegExp}
* <li>"maxDeterminizedStates" (optional, default 10000) the limit on total state count for the
* determined automaton computed from the regexp
* <li>"determinizeWorkLimit" (optional, default 10000) the limit on total effort spent to
* determinize the automaton computed from the regexp
* </ul>
*
* <p>The pattern matches the characters to include in a token (not the split characters), and the
Expand Down Expand Up @@ -63,16 +63,16 @@ public class SimplePatternTokenizerFactory extends TokenizerFactory {

public static final String PATTERN = "pattern";
private final Automaton dfa;
private final int maxDeterminizedStates;
private final int determinizeWorkLimit;

/** Creates a new SimplePatternTokenizerFactory */
public SimplePatternTokenizerFactory(Map<String, String> args) {
super(args);
maxDeterminizedStates =
getInt(args, "maxDeterminizedStates", Operations.DEFAULT_MAX_DETERMINIZED_STATES);
determinizeWorkLimit =
getInt(args, "determinizeWorkLimit", Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
dfa =
Operations.determinize(
new RegExp(require(args, PATTERN)).toAutomaton(), maxDeterminizedStates);
new RegExp(require(args, PATTERN)).toAutomaton(), determinizeWorkLimit);
if (args.isEmpty() == false) {
throw new IllegalArgumentException("Unknown parameters: " + args);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -616,7 +616,7 @@ private String randomNonEmptyString(Random random) {
random -> {
return Operations.determinize(
new RegExp(AutomatonTestUtil.randomRegexp(random), RegExp.NONE).toAutomaton(),
Operations.DEFAULT_MAX_DETERMINIZED_STATES);
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
});
put(
PatternTypingFilter.PatternTypingRule[].class,
Expand Down
22 changes: 11 additions & 11 deletions lucene/core/src/java/org/apache/lucene/search/AutomatonQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ public class AutomatonQuery extends MultiTermQuery implements Accountable {
* @param automaton Automaton to run, terms that are accepted are considered a match.
*/
public AutomatonQuery(final Term term, Automaton automaton) {
this(term, automaton, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(term, automaton, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/**
Expand All @@ -74,12 +74,12 @@ public AutomatonQuery(final Term term, Automaton automaton) {
* @param term Term containing field and possibly some pattern structure. The term text is
* ignored.
* @param automaton Automaton to run, terms that are accepted are considered a match.
* @param maxDeterminizedStates maximum number of states in the resulting automata. If the
* automata would need more than this many states TooComplextToDeterminizeException is thrown.
* Higher number require more space but can process more complex automata.
* @param determinizeWorkLimit maximum effort to spend determinizing the automaton. If the
* automaton would need more than this much effort, TooComplexToDeterminizeException is
* thrown. Higher numbers require more space but can process more complex automata.
*/
public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedStates) {
this(term, automaton, maxDeterminizedStates, false);
public AutomatonQuery(final Term term, Automaton automaton, int determinizeWorkLimit) {
this(term, automaton, determinizeWorkLimit, false);
}

/**
Expand All @@ -88,20 +88,20 @@ public AutomatonQuery(final Term term, Automaton automaton, int maxDeterminizedS
* @param term Term containing field and possibly some pattern structure. The term text is
* ignored.
* @param automaton Automaton to run, terms that are accepted are considered a match.
* @param maxDeterminizedStates maximum number of states in the resulting automata. If the
* automata would need more than this many states TooComplextToDeterminizeException is thrown.
* Higher number require more space but can process more complex automata.
* @param determinizeWorkLimit maximum effort to spend determinizing the automaton. If the
* automaton will need more than this much effort, TooComplexToDeterminizeException is thrown.
* Higher numbers require more space but can process more complex automata.
* @param isBinary if true, this automaton is already binary and will not go through the
* UTF32ToUTF8 conversion
*/
public AutomatonQuery(
final Term term, Automaton automaton, int maxDeterminizedStates, boolean isBinary) {
final Term term, Automaton automaton, int determinizeWorkLimit, boolean isBinary) {
super(term.field());
this.term = term;
this.automaton = automaton;
this.automatonIsBinary = isBinary;
// TODO: we could take isFinite too, to save a bit of CPU in CompiledAutomaton ctor?:
this.compiled = new CompiledAutomaton(automaton, null, true, maxDeterminizedStates, isBinary);
this.compiled = new CompiledAutomaton(automaton, null, true, determinizeWorkLimit, isBinary);

this.ramBytesUsed =
BASE_RAM_BYTES + term.ramBytesUsed() + automaton.ramBytesUsed() + compiled.ramBytesUsed();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ public class PrefixQuery extends AutomatonQuery {

/** Constructs a query for terms starting with <code>prefix</code>. */
public PrefixQuery(Term prefix) {
// It's OK to pass unlimited maxDeterminizedStates: the automaton is born small and
// It's OK to pass unlimited determinizeWorkLimit: the automaton is born small and
// determinized:
super(prefix, toAutomaton(prefix.bytes()), Integer.MAX_VALUE, true);
}
Expand Down
46 changes: 26 additions & 20 deletions lucene/core/src/java/org/apache/lucene/search/RegexpQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,20 +69,21 @@ public RegexpQuery(Term term) {
* @param flags optional RegExp features from {@link RegExp}
*/
public RegexpQuery(Term term, int flags) {
this(term, flags, defaultProvider, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(term, flags, defaultProvider, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/**
* Constructs a query for terms matching <code>term</code>.
*
* @param term regular expression.
* @param flags optional RegExp syntax features from {@link RegExp}
* @param maxDeterminizedStates maximum number of states that compiling the automaton for the
* regexp can result in. Set higher to allow more complex queries and lower to prevent memory
* exhaustion.
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
* otherwise know what to specify.
*/
public RegexpQuery(Term term, int flags, int maxDeterminizedStates) {
this(term, flags, defaultProvider, maxDeterminizedStates);
public RegexpQuery(Term term, int flags, int determinizeWorkLimit) {
this(term, flags, defaultProvider, determinizeWorkLimit);
}

/**
Expand All @@ -93,10 +94,13 @@ public RegexpQuery(Term term, int flags, int maxDeterminizedStates) {
* regexp can result in. Set higher to allow more complex queries and lower to prevent memory
* exhaustion.
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
* @param maxDeterminizedStates maximum number of states that compiling the
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
* otherwise know what to specify.
*/
public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDeterminizedStates) {
this(term, syntax_flags, match_flags, defaultProvider, maxDeterminizedStates);
public RegexpQuery(Term term, int syntax_flags, int match_flags, int determinizeWorkLimit) {
this(term, syntax_flags, match_flags, defaultProvider, determinizeWorkLimit);
}

/**
Expand All @@ -105,13 +109,14 @@ public RegexpQuery(Term term, int syntax_flags, int match_flags, int maxDetermin
* @param term regular expression.
* @param syntax_flags optional RegExp features from {@link RegExp}
* @param provider custom AutomatonProvider for named automata
* @param maxDeterminizedStates maximum number of states that compiling the automaton for the
* regexp can result in. Set higher to allow more complex queries and lower to prevent memory
* exhaustion.
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
* otherwise know what to specify.
*/
public RegexpQuery(
Term term, int syntax_flags, AutomatonProvider provider, int maxDeterminizedStates) {
this(term, syntax_flags, 0, provider, maxDeterminizedStates);
Term term, int syntax_flags, AutomatonProvider provider, int determinizeWorkLimit) {
this(term, syntax_flags, 0, provider, determinizeWorkLimit);
}

/**
Expand All @@ -121,21 +126,22 @@ public RegexpQuery(
* @param syntax_flags optional RegExp features from {@link RegExp}
* @param match_flags boolean 'or' of match behavior options such as case insensitivity
* @param provider custom AutomatonProvider for named automata
* @param maxDeterminizedStates maximum number of states that compiling the automaton for the
* regexp can result in. Set higher to allow more complex queries and lower to prevent memory
* exhaustion.
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
* regexp. Set higher to allow more complex queries and lower to prevent memory exhaustion.
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
* otherwise know what to specify.
*/
public RegexpQuery(
Term term,
int syntax_flags,
int match_flags,
AutomatonProvider provider,
int maxDeterminizedStates) {
int determinizeWorkLimit) {
super(
term,
new RegExp(term.text(), syntax_flags, match_flags)
.toAutomaton(provider, maxDeterminizedStates),
maxDeterminizedStates);
.toAutomaton(provider, determinizeWorkLimit),
determinizeWorkLimit);
}

/** Returns the regexp of this query wrapped in a Term. */
Expand Down
11 changes: 6 additions & 5 deletions lucene/core/src/java/org/apache/lucene/search/WildcardQuery.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,13 @@ public WildcardQuery(Term term) {
/**
* Constructs a query for terms matching <code>term</code>.
*
* @param maxDeterminizedStates maximum number of states in the resulting automata. If the
* automata would need more than this many states TooComplextToDeterminizeException is thrown.
* Higher number require more space but can process more complex automata.
* @param determinizeWorkLimit maximum effort to spend while compiling the automaton from this
* wildcard. Set higher to allow more complex queries and lower to prevent memory exhaustion.
* Use {@link Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't
* otherwise know what to specify.
*/
public WildcardQuery(Term term, int maxDeterminizedStates) {
super(term, toAutomaton(term), maxDeterminizedStates);
public WildcardQuery(Term term, int determinizeWorkLimit) {
super(term, toAutomaton(term), determinizeWorkLimit);
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,12 @@ public class ByteRunAutomaton extends RunAutomaton {

/** Converts incoming automaton to byte-based (UTF32ToUTF8) first */
public ByteRunAutomaton(Automaton a) {
this(a, false, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(a, false, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/** expert: if isBinary is true, the input is already byte-based */
public ByteRunAutomaton(Automaton a, boolean isBinary, int maxDeterminizedStates) {
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, maxDeterminizedStates);
public ByteRunAutomaton(Automaton a, boolean isBinary, int determinizeWorkLimit) {
super(isBinary ? a : new UTF32ToUTF8().convert(a), 256, determinizeWorkLimit);
}

/** Returns true if the given byte array is accepted by this automaton */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,22 @@

/** Automaton representation for matching char[]. */
public class CharacterRunAutomaton extends RunAutomaton {
/** Construct with a default number of maxDeterminizedStates. */
/** Construct with a default number of determinizeWorkLimit. */
public CharacterRunAutomaton(Automaton a) {
this(a, Operations.DEFAULT_MAX_DETERMINIZED_STATES);
this(a, Operations.DEFAULT_DETERMINIZE_WORK_LIMIT);
}

/**
* Construct specifying maxDeterminizedStates.
* Constructor specifying determinizeWorkLimit.
*
* @param a Automaton to match
* @param maxDeterminizedStates maximum number of states that the automaton can have once
* determinized. If more states are required to determinize it then a
* TooComplexToDeterminizeException is thrown.
* @param determinizeWorkLimit maximum effort to spend determinizing the automataon. If more
* effort is required then a TooComplexToDeterminizeException is thrown. Use {@link
* Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't otherwise know
* what to specify.
*/
public CharacterRunAutomaton(Automaton a, int maxDeterminizedStates) {
super(a, Character.MAX_CODE_POINT + 1, maxDeterminizedStates);
public CharacterRunAutomaton(Automaton a, int determinizeWorkLimit) {
super(a, Character.MAX_CODE_POINT + 1, determinizeWorkLimit);
}

/** Returns true if the given string is accepted by this automaton. */
Expand Down
Loading

0 comments on commit c4cf7aa

Please sign in to comment.