Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Regex timestamp extract #1727

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
23 changes: 22 additions & 1 deletion iped-app/resources/config/conf/ExternalParsers.xml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,10 @@
</parser>
-->

<!-- A regex can be used to parse the external parser output and extract metadata from it.
Regex groups named "key" and "value" are used as the extracted metadata name and value, respectively.
See the PrefetchParser example below, which extracts timestamps from the sccainfo output.
-->
<parser>
<name>PrefetchParser</name>
<win-tool-path>tools/sccainfo/</win-tool-path>
Expand All @@ -79,6 +83,10 @@
<mime-type>application/x-prefetch</mime-type>
</mime-types>
<output-charset>ISO-8859-1</output-charset>
<metadata>
<match>(?&lt;key&gt;[A-Za-z0-9\s]+):\s*(\d+\s*:\s*)?(?&lt;value&gt;((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s(0[1-9]|[1-2][0-9]|3[0-1]),?\s\d{4}\s([0-1][0-9]|2[0-3]):([0-5][0-9]):(([0-5][0-9])(\.\d*)?)\sUTC))</match>
</metadata>
<extractAsTextContent/><!-- if the metadata tag is defined, this tag must also be specified for the full command output to be added as the item's text content -->
</parser>

<parser>
Expand All @@ -94,6 +102,11 @@
</mime-types>
<output-charset>UTF-8</output-charset>
<firstLinesToIgnore>3</firstLinesToIgnore>

<metadata>
<match key="deleted">(\d{4}[:-](0[1-9]|1[0-2])[:-](0[1-9]|[1-2][0-9]|3[0-1])(\s|T)([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])Z?)</match>
</metadata>
<extractAsTextContent/><!-- if the metadata tag is defined, this tag must also be specified for the full command output to be added as the item's text content -->
</parser>

<parser>
Expand All @@ -109,6 +122,10 @@
</mime-types>
<output-charset>UTF-8</output-charset>
<firstLinesToIgnore>3</firstLinesToIgnore>
<metadata>
<match key="deleted">(\d{4}[:-](0[1-9]|1[0-2])[:-](0[1-9]|[1-2][0-9]|3[0-1])(\s|T)([0-1][0-9]|2[0-3]):([0-5][0-9]):([0-5][0-9])Z?)</match>
</metadata>
<extractAsTextContent/><!-- if the metadata tag is defined, this tag must also be specified for the full command output to be added as the item's text content -->
</parser>

<parser>
Expand Down Expand Up @@ -151,6 +168,10 @@
<mime-type>application/x-superfetch</mime-type>
</mime-types>
<output-charset>ISO-8859-1</output-charset>
</parser>
<metadata>
<match>(?&lt;key&gt;[A-Za-z0-9\s]+):\s*(\d+\s*:\s*)?(?&lt;value&gt;((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s(0[1-9]|[1-2][0-9]|3[0-1]),?\s\d{4}\s([0-1][0-9]|2[0-3]):([0-5][0-9]):(([0-5][0-9])(\.\d*)?)\sUTC))</match>
</metadata>
<extractAsTextContent/><!-- if the metadata tag is defined, this tag must also be specified for the full command output to be added as the item's text content -->
</parser>

</external-parsers>
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,12 @@ public class ExternalParser extends AbstractParser {

private static Logger LOGGER;

// Values accepted by outputExtractionScheme.
// IGNORE: the command output is not added as text content.
public final static int IGNORE = 0;
// APPEND: the command output is passed to extractOutput and added to the XHTML content.
public final static int APPEND = 1;
// NOTE(review): no use of SPLITSUBITEM is visible in this part of the file - confirm its intended behavior.
public final static int SPLITSUBITEM = 2;

int outputExtractionScheme = APPEND;// default is APPEND; can be changed through setOutputExtractionScheme

public static final String EXTERNAL_PARSERS_ROOT = "iped.extParsers.root";

/**
Expand All @@ -88,11 +94,11 @@ public interface LineConsumer extends Serializable {
*/
LineConsumer NULL = new LineConsumer() {
/**
*
*/
private static final long serialVersionUID = 1L;
*
*/
private static final long serialVersionUID = 1L;

@Override
@Override
public void consume(String line) {
// ignores
}
Expand Down Expand Up @@ -210,6 +216,15 @@ public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) {
this.metadataPatterns = patterns;
}

/**
* Sets how the output of the external command is handled as text content.
* Expected values are the constants {@link #IGNORE}, {@link #APPEND} and
* {@link #SPLITSUBITEM}; the value is stored as given, without validation.
*
* @param scheme the output extraction scheme to use
*/
public void setOutputExtractionScheme(int scheme) {
this.outputExtractionScheme = scheme;
}

/**
* Executes the configured external command and passes the given document stream
* as a simple XHTML document to the given SAX content handler. Metadata is only
Expand Down Expand Up @@ -255,7 +270,7 @@ private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata m
boolean inputToStdIn = true;
boolean outputFromStdOut = true;
boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());


File outputFile = tmp.createTemporaryFile();

// Build our command
Expand Down Expand Up @@ -330,15 +345,17 @@ private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata m

process.waitFor();

try (InputStream is = new FileInputStream(outputFile)) {
if (hasPatterns) {
if (hasPatterns) {
try (InputStream is = new FileInputStream(outputFile)) {
extractMetadata(is, metadata);
} else {
}
}
if (outputExtractionScheme == APPEND) {
try (InputStream is = new FileInputStream(outputFile)) {
File tmpFile = inputToStdIn ? null : stream.getFile();
extractOutput(is, xhtml, metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY), tmpFile);
}
}

} catch (InterruptedException e) {
LOGGER.warn(
parserName + " interrupted while processing " + metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY)
Expand All @@ -360,8 +377,8 @@ private Thread extractOutputInBackground(final InputStream stream, final File ou
// Copies everything readable from the captured stream into outFile.
public void run() {
try {
// REPLACE_EXISTING: overwrite outFile if it already exists
Files.copy(stream, outFile.toPath(), StandardCopyOption.REPLACE_EXISTING);

} catch (IOException e) {
// NOTE(review): a failed copy is only printed to stderr and is not reported to the
// caller; consider routing it through the class logger - confirm with the owner.
e.printStackTrace();
}
}
};
Expand Down Expand Up @@ -515,7 +532,20 @@ private void extractMetadata(final InputStream stream, final Metadata metadata)
if (metadataPatterns.get(p) != null && !metadataPatterns.get(p).equals("")) {
metadata.add(metadataPatterns.get(p), m.group(1));
} else {
metadata.add(m.group(1), m.group(2));
String propertyName;
String value;
try {
propertyName = m.group("key");
propertyName=propertyName.replace(" ", "").replace("\t", "").replace("\n", "").replace("\r", "");
}catch (IllegalArgumentException iae) {
propertyName = m.group(1);
}
try {
value = m.group("value");
}catch (IllegalArgumentException iae) {
value = m.group(2);
}
metadata.add(parserName+":"+propertyName, value);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ public static List<ExternalParser> read(Element element) throws TikaException, I
private static ExternalParser readParser(Element parserDef) throws TikaException {
ExternalParser parser = new ExternalParser();

String appendStr = null;
NodeList children = parserDef.getChildNodes();
Element checkElement = null;
for (int i = 0; i < children.getLength(); i++) {
Expand All @@ -122,6 +123,8 @@ private static ExternalParser readParser(Element parserDef) throws TikaException
parser.setSupportedTypes(readMimeTypes(child));
} else if (child.getTagName().equals(METADATA_TAG)) {
parser.setMetadataExtractionPatterns(readMetadataPatterns(child));
} else if (child.getTagName().equals(APPENDCONTENT_TAG)) {
appendStr = getString(child);
} else if (child.getTagName().equals(PARSER_NAME_TAG)) {
parser.setParserName(getString(child));
} else if (child.getTagName().equals(WIN_TOOL_PATH)) {
Expand All @@ -135,6 +138,17 @@ private static ExternalParser readParser(Element parserDef) throws TikaException
}
}
}
if (appendStr != null) {
parser.setOutputExtractionScheme(ExternalParser.APPEND);
if (!"".equals(appendStr.trim())) {
parser.setOutputExtractionScheme(Integer.parseInt(appendStr));
}
} else {
if (parser.getMetadataExtractionPatterns() != null && parser.getMetadataExtractionPatterns().size() > 0) {
parser.setOutputExtractionScheme(ExternalParser.IGNORE);// if metadata extraction is defined, default
// behavior is to ignore text content
}
}
if (checkElement != null) {
String tool = parser.getCommand()[0].split(" ")[0];
synchronized (lock) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ public interface ExternalParsersConfigReaderMetKeys {

// Name of the element holding the metadata extraction patterns of a parser.
String METADATA_TAG = "metadata";

// Name of the element the config reader uses to choose the output extraction scheme.
String APPENDCONTENT_TAG = "extractAsTextContent";

// Name of the element carrying one regular expression (presumably nested inside the metadata element - TODO confirm).
String METADATA_MATCH_TAG = "match";

// Name of the attribute carrying a metadata key (presumably set on the match element - TODO confirm).
String METADATA_KEY_ATTR = "key";
Expand Down
44 changes: 42 additions & 2 deletions iped-utils/src/main/java/iped/utils/DateUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ public class DateUtil {

private static final DateUtil INSTANCE = new DateUtil();

private static Pattern pattern;

private static DateFormat createDateFormat(String format, TimeZone timezone) {
final SimpleDateFormat sdf = new SimpleDateFormat(format, new DateFormatSymbols(Locale.US));
if (timezone != null) {
Expand All @@ -38,6 +40,8 @@ private static DateFormat createDateFormat(String format, TimeZone timezone) {
*/
private final List<DateFormat> iso8601InputFormats = loadDateFormats();

// Format for dates written with an English month name, e.g. "Jan 02, 2020 03:04:05 UTC".
// 'yyyy' (calendar year) is required here: with 'YYYY' (week year) SimpleDateFormat
// resolves the result as a week date, so the parsed month and day are discarded.
// Locale.US keeps month-name parsing independent of the JVM default locale.
// SimpleDateFormat is not thread safe, so access to this instance must be synchronized.
private final DateFormat monthFormat = new SimpleDateFormat("MMM dd, yyyy HH:mm:ssZ", Locale.US);

private List<DateFormat> loadDateFormats() {
List<DateFormat> dateFormats = new ArrayList<>();
// yyyy-mm-ddThh...
Expand Down Expand Up @@ -84,6 +88,26 @@ public Date tryToParse(String dateString) {
return null;
}

/**
 * Tries to parse a date string using the month-name format held in
 * {@code monthFormat}; returns null if no parse was possible.
 *
 * A fractional-seconds part (a dot, any digits and the single whitespace
 * character that follows) is stripped before parsing.
 *
 * This is not thread safe! Wrap in synchronized or create a new
 * {@link DateUtil} for each class.
 *
 * @param dateString
 *            the text to parse
 * @return the parsed date, or null if the text could not be parsed
 */
public Date tryToParseExt(String dateString) {
    // drop the fractional seconds, which the month-name format does not expect
    final String withoutMillis = dateString.replaceAll("\\.\\d*\\s", "");
    Date parsed = null;
    try {
        parsed = monthFormat.parse(withoutMillis);
    } catch (java.text.ParseException ignored) {
        // unparseable input is reported to the caller as null
    }
    return parsed;
}

/**
* Thread-safe method internally synchronized
*
Expand All @@ -95,8 +119,15 @@ public static Date tryToParseDate(String val) {
synchronized (INSTANCE) {
return INSTANCE.tryToParse(val);
}
} else
return null;
} else {
if(val.length()>=21 && Character.isDigit(val.charAt(4)) && val.charAt(6)==',') {
synchronized (INSTANCE) {
return INSTANCE.tryToParseExt(val);
}
}else {
return null;
}
}
}

// Thread local variable
Expand All @@ -117,4 +148,13 @@ public static Date stringToDate(String date) throws ParseException {
return threadLocal.get().parse(date);
}

/**
 * Returns the shared compiled pattern that recognizes date strings in either
 * of two layouts:
 * <ul>
 * <li>ISO-like, captured in the named group {@code ISO}: four-digit year,
 * month and day separated by ':' or '-', a whitespace or 'T', then
 * HH:mm:ss with an optional trailing 'Z';</li>
 * <li>day name, month name, day, HH:mm:ss (optional 'Z') and four-digit
 * year, e.g. "Thu Mar 04 05:06:07 2021".</li>
 * </ul>
 *
 * The pattern is compiled once, on first use, by the JVM class
 * initialization of the holder class, so concurrent callers always get the
 * same fully initialized instance without any explicit locking.
 *
 * @return the compiled date-string pattern, never null
 */
public static Pattern getDateStrPattern() {
    return DateStrPatternHolder.PATTERN;
}

/** Initialization-on-demand holder for the date-string pattern. */
private static final class DateStrPatternHolder {
    private static final Pattern PATTERN = Pattern.compile(
            "(?<ISO>\\d{4}[:-](0[1-9]|1[0-2])[:-](0[1-9]|[1-2][0-9]|3[0-1])(\\s|T)([0-1][0-9]|2[0-3])\\:([0-5][0-9])\\:([0-5][0-9])Z?)"
                    + "|((Mon|Tue|Wed|Thu|Fri|Sat|Sun)\\s(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\\s(0[1-9]|[1-2][0-9]|3[0-1])\\s([0-1][0-9]|2[0-3])\\:([0-5][0-9])\\:(([0-5][0-9])Z?)\\s\\d{4})");

    private DateStrPatternHolder() {
    }
}

}