Skip to content

Commit

Permalink
Enhance ScriptUtils: fix parser (#7646)
Browse files Browse the repository at this point in the history
Co-authored-by: Eddú Meléndez <eddu.melendez@gmail.com>
Co-authored-by: Kevin Wittek <kiview@users.noreply.github.com>
  • Loading branch information
3 people committed Oct 30, 2023
1 parent 3422ebb commit 9cfa166
Show file tree
Hide file tree
Showing 4 changed files with 511 additions and 202 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
package org.testcontainers.ext;

import lombok.Getter;
import lombok.RequiredArgsConstructor;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Rough lexical parser for SQL scripts.
 *
 * <p>The scanner walks {@code script} from left to right. Every call to {@link #next()}
 * consumes exactly one lexem that starts at the current offset, returns its kind and makes
 * its text available through {@link #getCurrentMatch()}.
 */
@RequiredArgsConstructor
class ScriptScanner {

    /** Name of the script, only used when reporting parse errors. */
    private final String resource;

    /** Full text of the script being scanned. */
    private final String script;

    /** Statement separator. */
    private final String separator;

    /** Prefix that starts a single line comment. */
    private final String commentPrefix;

    /** Delimiter that opens a block comment. */
    private final String blockCommentStartDelimiter;

    /** Delimiter that closes a block comment. */
    private final String blockCommentEndDelimiter;

    // Patterns are immutable and thread-safe, so they are compiled once for all scanners.

    /** One or more consecutive line terminators. */
    private static final Pattern EOL = Pattern.compile("[\n\r]+");

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    private static final Pattern IDENTIFIER = Pattern.compile("[a-z][a-z0-9_]*", Pattern.CASE_INSENSITIVE);

    /** {@code '...'}, where a quote preceded by a backslash may be part of the string. */
    private static final Pattern SINGLE_QUOTED_STRING = Pattern.compile("'(\\\\'|[^'])*'");

    /** {@code "..."}, where a quote preceded by a backslash may be part of the string. */
    private static final Pattern ANSI_QUOTED_STRING = Pattern.compile("\"(\\\\\"|[^\"])*\"");

    /** Opening (and closing) tag of a dollar quoted string: two dollar signs around word characters. */
    private static final Pattern DOLLAR_QUOTED_STRING_DELIMITER = Pattern.compile("\\$\\w*\\$");

    /** Index in {@code script} of the first character that has not been consumed yet. */
    private int offset;

    /** Text of the lexem returned by the most recent successful match. */
    @Getter
    private String currentMatch;

    /**
     * Consumes {@code substring} if the script continues with it at the current offset.
     *
     * @param substring literal text to look for
     * @return {@code true} if the text was found and consumed; otherwise {@code false}, and
     *     {@code currentMatch} is reset to the empty string
     */
    private boolean matches(String substring) {
        // An empty (or absent) delimiter can never be a real token: accepting it would report
        // a match without advancing the offset, so the caller would never make progress.
        if (substring != null && !substring.isEmpty() && script.startsWith(substring, offset)) {
            currentMatch = substring;
            offset += substring.length();
            return true;
        }
        currentMatch = "";
        return false;
    }

    /**
     * Consumes the text matched by {@code regexp} if a match starts exactly at the current offset.
     *
     * @param regexp pattern to apply at the current offset
     * @return {@code true} if the pattern matched and the text was consumed; otherwise
     *     {@code false}, and {@code currentMatch} is reset to the empty string
     */
    private boolean matches(Pattern regexp) {
        // lookingAt() is anchored at the region start, so a failed attempt costs only the work
        // needed at this position instead of a search through the rest of the script.
        Matcher m = regexp.matcher(script).region(offset, script.length());
        if (m.lookingAt()) {
            currentMatch = m.group();
            offset = m.end();
            return true;
        }
        currentMatch = "";
        return false;
    }

    /**
     * Matches from {@code commentPrefix} up to and including the next run of line terminators,
     * or up to the end of the script if there is none.
     */
    private boolean matchesSingleLineComment() {
        int start = offset;
        if (!matches(commentPrefix)) {
            return false;
        }
        Matcher m = EOL.matcher(script);
        offset = m.find(offset) ? m.end() : script.length();
        currentMatch = script.substring(start, offset);
        return true;
    }

    /**
     * Matches from {@code blockCommentStartDelimiter} to the next {@code blockCommentEndDelimiter}.
     *
     * @throws ScriptUtils.ScriptParseException if the end delimiter is not found
     */
    private boolean matchesMultilineComment() {
        int start = offset;
        if (!matches(blockCommentStartDelimiter)) {
            return false;
        }
        int end = script.indexOf(blockCommentEndDelimiter, offset);
        if (end < 0) {
            throw new ScriptUtils.ScriptParseException(
                String.format("Missing block comment end delimiter [%s].", blockCommentEndDelimiter),
                resource
            );
        }
        offset = end + blockCommentEndDelimiter.length();
        currentMatch = script.substring(start, offset);
        return true;
    }

    /**
     * Matches a dollar quoted string: an opening tag, any text, and the next occurrence of
     * the very same tag.
     *
     * @throws ScriptUtils.ScriptParseException if the closing tag is not found
     */
    private boolean matchesDollarQuotedString() {
        int start = offset;
        if (!matches(DOLLAR_QUOTED_STRING_DELIMITER)) {
            return false;
        }
        String delimiter = currentMatch;
        int end = script.indexOf(delimiter, offset);
        if (end < 0) {
            throw new ScriptUtils.ScriptParseException(
                String.format("Unclosed dollar quoted string [%s].", delimiter),
                resource
            );
        }
        offset = end + delimiter.length();
        currentMatch = script.substring(start, offset);
        return true;
    }

    /**
     * Consumes the next lexem of the script.
     *
     * <p>Candidates are tried in a fixed order: separator, comments, quoted strings, identifier,
     * whitespace. A character that fits none of them is returned on its own as
     * {@link Lexem#OTHER}.
     *
     * @return the kind of the consumed lexem, or {@link Lexem#EOF} once the whole script has
     *     been consumed
     */
    Lexem next() {
        if (offset >= script.length()) {
            return Lexem.EOF;
        }
        if (matches(separator)) {
            return Lexem.SEPARATOR;
        }
        if (matchesSingleLineComment() || matchesMultilineComment()) {
            return Lexem.COMMENT;
        }
        if (matches(SINGLE_QUOTED_STRING) || matches(ANSI_QUOTED_STRING) || matchesDollarQuotedString()) {
            return Lexem.QUOTED_STRING;
        }
        if (matches(IDENTIFIER)) {
            return Lexem.IDENTIFIER;
        }
        if (matches(WHITESPACE)) {
            return Lexem.WHITESPACE;
        }
        currentMatch = String.valueOf(script.charAt(offset++));
        return Lexem.OTHER;
    }

    /** Kinds of lexems reported by {@link #next()}. */
    enum Lexem {
        SEPARATOR,
        COMMENT,
        QUOTED_STRING,
        WHITESPACE,
        IDENTIFIER,
        OTHER,
        EOF,
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
package org.testcontainers.ext;

import lombok.RequiredArgsConstructor;
import org.apache.commons.lang3.StringUtils;
import org.testcontainers.ext.ScriptScanner.Lexem;

import java.util.List;

/**
 * Performs splitting of an SQL script into statements including
 * basic clean-up.
 *
 * <p>Lexems are pulled from the given {@link ScriptScanner}; every completed, non-empty
 * statement is added to the {@code statements} list passed to the constructor.
 */
@RequiredArgsConstructor
class ScriptSplitter {

    /** Source of lexems. */
    private final ScriptScanner scanner;

    /** Output list that receives the statements. */
    private final List<String> statements;

    /** Text of the statement currently being assembled. */
    private final StringBuilder sb = new StringBuilder();

    /**
     * Standard parsing:
     * 1. Remove comments
     * 2. Shrink whitespace and eols
     * 3. Split on separator
     *
     * <p>An identifier {@code begin} (in any letter case) switches to compound statement mode
     * until the matching end of block; the whole block becomes a single statement.
     */
    void split() {
        Lexem l;
        while ((l = scanner.next()) != Lexem.EOF) {
            switch (l) {
                case SEPARATOR:
                    flushStringBuilder();
                    break;
                case COMMENT:
                    //skip
                    break;
                case WHITESPACE:
                    appendSingleSpace();
                    break;
                case IDENTIFIER:
                    appendMatch();
                    if ("begin".equalsIgnoreCase(scanner.getCurrentMatch())) {
                        compoundStatement(false);
                        flushStringBuilder();
                    }
                    break;
                default:
                    appendMatch();
                    break;
            }
        }
        // Whatever follows the last separator is a statement as well.
        flushStringBuilder();
    }

    /**
     * Compound statement ('create procedure') mode:
     * 1. Do not remove comments
     * 2. Do not shrink whitespace
     * 3. Do not split on separators
     * 4. This mode can be recursive
     *
     * @param recursive {@code true} when this block is nested inside another compound statement
     */
    private void compoundStatement(boolean recursive) {
        Lexem l;
        while ((l = scanner.next()) != Lexem.EOF) {
            String match = scanner.getCurrentMatch();
            sb.append(match);
            if (l == Lexem.IDENTIFIER) {
                if ("begin".equalsIgnoreCase(match)) {
                    compoundStatement(true);
                } else if ("end".equalsIgnoreCase(match) && endOfBlock(recursive)) {
                    return;
                }
            }
        }
        // Script ended inside the block: emit what has been collected so far.
        flushStringBuilder();
    }

    /**
     * Decides, right after an {@code end} identifier, whether that identifier closes the
     * current block. It does when only whitespace and comments separate it from the next
     * separator, from a semicolon, or from the end of the script.
     *
     * @param recursive {@code true} for a nested block; the skipped text and the terminator
     *     are then kept as part of the enclosing statement
     * @return {@code true} if the block is finished, {@code false} if {@code end} is followed
     *     by something else (that lexem has already been appended)
     */
    private boolean endOfBlock(boolean recursive) {
        Lexem l;
        // Whitespace and comments are held back until it is known whether they are kept.
        StringBuilder temporary = new StringBuilder();
        while ((l = scanner.next()) != Lexem.EOF) {
            switch (l) {
                case COMMENT:
                case WHITESPACE:
                    temporary.append(scanner.getCurrentMatch());
                    break;
                case SEPARATOR:
                    //Only whitespace and comments preceded the separator: true end of block
                    //If it's an internal block, append everything
                    if (recursive) {
                        sb.append(temporary);
                        appendMatch();
                    }
                    return true;
                default:
                    // Semicolon is not recognized as separator: this means that a custom
                    // separator is used. Still, 'END;' should be a valid end of block
                    if (";".equals(scanner.getCurrentMatch())) {
                        if (recursive) {
                            sb.append(temporary);
                            appendMatch();
                        }
                        return true;
                    }
                    sb.append(temporary);
                    appendMatch();
                    return false;
            }
        }
        return true;
    }

    /** Appends the text of the scanner's current lexem to the statement being assembled. */
    private void appendMatch() {
        sb.append(scanner.getCurrentMatch());
    }

    /**
     * Appends one space unless the statement being assembled already ends with a space.
     * Only the last character is inspected, so the buffer is never copied.
     */
    private void appendSingleSpace() {
        int length = sb.length();
        if (length == 0 || sb.charAt(length - 1) != ' ') {
            sb.append(' ');
        }
    }

    /**
     * Adds the trimmed content of the buffer to {@code statements} if it is not empty,
     * then clears the buffer.
     */
    private void flushStringBuilder() {
        final String s = sb.toString().trim();
        if (StringUtils.isNotEmpty(s)) {
            statements.add(s);
        }
        sb.setLength(0);
    }
}
Loading

0 comments on commit 9cfa166

Please sign in to comment.