From 9771f23dcd204d0fed207505c485be2a73d70694 Mon Sep 17 00:00:00 2001 From: Alex Osborne Date: Fri, 28 Jun 2024 15:32:16 +0900 Subject: [PATCH] Add lenient WARC parsing mode Fixes #26 and #86 --- src/org/netpreserve/jwarc/WarcParser.java | 380 ++++++++++++++++-- src/org/netpreserve/jwarc/WarcParser.rl | 41 ++ src/org/netpreserve/jwarc/WarcReader.java | 15 + src/org/netpreserve/jwarc/cdx/CdxWriter.java | 1 + .../org/netpreserve/jwarc/WarcParserTest.java | 16 + 5 files changed, 413 insertions(+), 40 deletions(-) diff --git a/src/org/netpreserve/jwarc/WarcParser.java b/src/org/netpreserve/jwarc/WarcParser.java index 0ae73d4..dc3a5d4 100644 --- a/src/org/netpreserve/jwarc/WarcParser.java +++ b/src/org/netpreserve/jwarc/WarcParser.java @@ -21,7 +21,7 @@ import static java.nio.charset.StandardCharsets.US_ASCII; -// line 169 "WarcParser.rl" +// line 185 "WarcParser.rl" /** @@ -71,6 +71,30 @@ public void reset() { } } + /** + * Sets the lenient mode for the WarcParser. + *

+ * When enabled, this causes the parser to follow the specification less strictly, + * allowing reading of non-compliant records by: + *