From f31046835f297b40cd923ae5311211bfe4c73db2 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Fri, 27 Sep 2024 12:17:22 +0200 Subject: [PATCH] Make MetaData multi-valued to preserve values of repeating WARC and HTTP headers --- .../java/org/archive/resource/MetaData.java | 67 ++++++- .../org/archive/resource/MetaDataTest.java | 186 ++++++++++++++++++ .../archive/format/warc/mutliple-headers.warc | 47 +++++ 3 files changed, 297 insertions(+), 3 deletions(-) create mode 100644 src/test/java/org/archive/resource/MetaDataTest.java create mode 100644 src/test/resources/org/archive/format/warc/mutliple-headers.warc diff --git a/src/main/java/org/archive/resource/MetaData.java b/src/main/java/org/archive/resource/MetaData.java index 1237a51c..05c0ee06 100755 --- a/src/main/java/org/archive/resource/MetaData.java +++ b/src/main/java/org/archive/resource/MetaData.java @@ -7,6 +7,15 @@ import com.github.openjson.JSONObject; import com.github.openjson.JSONTokener; +/** + * A nested structure of {@linkplain JSONObject}s to hold the metadata of + * content in nested containers, e.g. an HTML page as payload of an HTTP response + * in a WARC record stored as gzip "member". + * + * MetaData is multi-valued: if a second value is added under the same "key" + * ("name"), both values are stored in a {@linkplain JSONArray} as value. This + * makes it possible to hold all values of repeating WARC or HTTP headers. 
+ */ public class MetaData extends JSONObject { private static final Logger LOG = @@ -67,6 +76,18 @@ public int getInt(String key) { } } + @Override + public int optInt(String key, int defaultValue) { + if (has(key)) { + try { + return super.getInt(key); + } catch(JSONException e) { + LOG.severe(e.getMessage()); + } + } + return defaultValue; + } + @Override public long getLong(String key) { try { @@ -77,6 +98,18 @@ public long getLong(String key) { } } + @Override + public long optLong(String key, long defaultValue) { + if (has(key)) { + try { + return super.getLong(key); + } catch(JSONException e) { + LOG.severe(e.getMessage()); + } + } + return defaultValue; + } + @Override public String getString(String key) { try { @@ -102,9 +135,37 @@ public void setTopMetaData(MetaData topMetaData) { this.topMetaData = topMetaData; } + @Override + public JSONObject put(String name, boolean value) throws JSONException { + return super.accumulate(name, value); + } + + @Override + public JSONObject put(String name, double value) throws JSONException { + return super.accumulate(name, value); + } + + @Override + public JSONObject put(String name, int value) throws JSONException { + return super.accumulate(name, value); + } + + @Override + public JSONObject put(String name, long value) throws JSONException { + return super.accumulate(name, value); + } + + @Override + public JSONObject put(String key, Object value) { + if (has(key)) { + return super.accumulate(key, value); + } + return super.put(key, value); + } + public JSONObject putString(String key, String val) { try { - return super.put(key,val); + return super.accumulate(key,val); } catch(JSONException e) { LOG.severe(e.getMessage()); return null; @@ -113,7 +174,7 @@ public JSONObject putString(String key, String val) { public JSONObject putLong(String key, long val) { try { - return super.put(key,String.valueOf(val)); + return super.accumulate(key,String.valueOf(val)); } catch(JSONException e) { LOG.severe(e.getMessage()); 
return null; @@ -122,7 +183,7 @@ public JSONObject putLong(String key, long val) { public JSONObject putBoolean(String key, boolean val) { try { - return super.put(key,val); + return super.accumulate(key,val); } catch(JSONException e) { LOG.severe(e.getMessage()); return null; diff --git a/src/test/java/org/archive/resource/MetaDataTest.java b/src/test/java/org/archive/resource/MetaDataTest.java new file mode 100644 index 00000000..64ef7b5b --- /dev/null +++ b/src/test/java/org/archive/resource/MetaDataTest.java @@ -0,0 +1,186 @@ +package org.archive.resource; + +import java.io.IOException; + +import org.archive.extract.ExtractingResourceFactoryMapper; +import org.archive.extract.ExtractingResourceProducer; +import org.archive.extract.ProducerUtils; +import org.archive.extract.ResourceFactoryMapper; +import org.archive.format.json.JSONUtils; + +import com.github.openjson.JSONArray; +import com.github.openjson.JSONObject; + +import junit.framework.TestCase; + +public class MetaDataTest extends TestCase { + + private static String[] testFilePaths = { + "src/test/resources/org/archive/format/warc/IAH-urls-wget.warc", + "src/test/resources/org/archive/format/warc/mutliple-headers.warc" + }; + + private static JSONObject obj = new JSONObject("{\"foo\":\"bar\",\"hello\":\"world\"}"); + + private MetaData putMetaData(MetaData m) { + m.putBoolean("boolean-1", false); + m.putBoolean("boolean-2", true); + m.put("boolean-3", true); + m.put("boolean-1", true); // append + + m.put("double-1", 0.5d); + m.put("double-2", 2.5d); + m.put("double-3", 3.5d); + m.put("double-1", 1.5d); // append + + m.put("int-1", 0); + m.put("int-2", 2); + m.put("int-3", 3); + m.put("int-1", 1); // append + + // choose JSON "numbers" which are forced into a Java long (too big for an integer) + m.putLong("long-1", 0xffffffffL + 0L); + m.putLong("long-2", 0xffffffffL + 2L); + m.put("long-3", 0xffffffffL + 3L); + m.put("long-1", 0xffffffffL + 1L); // append + + m.putString("string-1", "0"); + 
m.putString("string-2", "2"); + m.put("string-3", "3"); + m.put("string-1", "1"); // append + + m.putOpt("obj-1", obj); + m.put("obj-1", obj); // append + m.put("obj-2", obj); + m.putOpt("obj-2", null); // do nothing because value is null + + return m; + } + + private void verifyMultiValuedMetaData(MetaData m) { + // boolean + assertEquals(JSONArray.class, m.get("boolean-1").getClass()); + assertEquals(false, ((JSONArray) m.get("boolean-1")).getBoolean(0)); + assertEquals(true, ((JSONArray) m.get("boolean-1")).getBoolean(1)); + assertEquals(true, m.getBoolean("boolean-2")); + assertEquals(true, m.getBoolean("boolean-3")); + assertEquals(Boolean.class, m.get("boolean-3").getClass()); + assertEquals(true, m.optBoolean("boolean-3", false)); + assertEquals(false, m.optBoolean("boolean-99", false)); + + // double + assertEquals(JSONArray.class, m.get("double-1").getClass()); + assertEquals(0.5d, ((JSONArray) m.get("double-1")).getDouble(0)); + assertEquals(1.5d, ((JSONArray) m.get("double-1")).getDouble(1)); + assertEquals(2.5d, m.getDouble("double-2")); + assertEquals(3.5d, m.getDouble("double-3")); + assertEquals(Double.class, m.get("double-3").getClass()); + assertEquals(3.5d, m.optDouble("double-3")); + assertEquals(99.5d, m.optDouble("double-99", 99.5d)); + + // int + assertEquals(JSONArray.class, m.get("int-1").getClass()); + assertEquals(0, ((JSONArray) m.get("int-1")).getInt(0)); + assertEquals(1, ((JSONArray) m.get("int-1")).getInt(1)); + assertEquals(2, m.getInt("int-2")); + assertEquals(3, m.getInt("int-3")); + assertEquals(Integer.class, m.get("int-3").getClass()); + assertEquals(3, m.optInt("int-3")); + assertEquals(99, m.optInt("int-99", 99)); + + // long + assertEquals(JSONArray.class, m.get("long-1").getClass()); + assertEquals(0xffffffffL + 0L, ((JSONArray) m.get("long-1")).getLong(0)); + assertEquals(0xffffffffL + 1L, ((JSONArray) m.get("long-1")).getLong(1)); + assertEquals(0xffffffffL + 2L, m.getLong("long-2")); + assertEquals(0xffffffffL + 3L, 
m.getLong("long-3")); + assertEquals(Long.class, m.get("long-3").getClass()); + assertEquals(0xffffffffL + 3L, m.optLong("long-3")); + assertEquals(0xffffffffL + 99L, m.optLong("long-99", 0xffffffffL + 99L)); + + // String + assertEquals(JSONArray.class, m.get("string-1").getClass()); + assertEquals("0", ((JSONArray) m.get("string-1")).getString(0)); + assertEquals("1", ((JSONArray) m.get("string-1")).getString(1)); + assertEquals("2", m.getString("string-2")); + assertEquals("3", m.getString("string-3")); + assertEquals(String.class, m.get("string-3").getClass()); + assertEquals("3", m.optString("string-3")); + assertEquals("99", m.optString("string-99", "99")); + + // Object + assertEquals(JSONArray.class, m.get("obj-1").getClass()); + assertEquals(JSONObject.class, ((JSONArray) m.get("obj-1")).get(0).getClass()); + assertEquals(JSONObject.class, ((JSONArray) m.get("obj-1")).get(1).getClass()); + assertEquals("bar", ((JSONObject) ((JSONArray) m.get("obj-1")).get(0)).get("foo")); + assertEquals("world", ((JSONObject) ((JSONArray) m.get("obj-1")).get(0)).get("hello")); + assertEquals("bar", ((JSONObject) ((JSONArray) m.get("obj-1")).get(1)).get("foo")); + assertEquals("world", ((JSONObject) ((JSONArray) m.get("obj-1")).get(1)).get("hello")); + assertEquals(JSONObject.class, m.get("obj-2").getClass()); + assertEquals("bar", ((JSONObject) m.get("obj-2")).get("foo")); + assertEquals("world", ((JSONObject) m.get("obj-2")).get("hello")); + } + + public void testMultiValued() { + MetaData m = new MetaData(); + m = putMetaData(m); + verifyMultiValuedMetaData(m); + + // test (de)serialization + m = new MetaData(m.toString(2)); + verifyMultiValuedMetaData(m); + } + + private MetaData readNextWARCResponseAsMetaData(String filePath) throws IOException, ResourceParseException { + ResourceProducer producer = ProducerUtils.getProducer(filePath); + ResourceFactoryMapper mapper = new ExtractingResourceFactoryMapper(); + ExtractingResourceProducer exProducer = new 
ExtractingResourceProducer(producer, mapper); + Resource r = exProducer.getNext(); + while (r != null) { + MetaData top = r.getMetaData().getTopMetaData(); + JSONObject warcHeaders = JSONUtils.extractObject(top, "Envelope.WARC-Header-Metadata"); + if (warcHeaders.has("WARC-Type") && "response".equals(warcHeaders.getString("WARC-Type"))) { + return top; + } + r = exProducer.getNext(); + } + return null; + } + + /** + * Verify that in the legacy test file all WARC and HTTP headers are + * single-valued, i.e. {@linkplain String}s. + */ + public void testSingleHeaders() throws IOException, ResourceParseException { + MetaData m = readNextWARCResponseAsMetaData(testFilePaths[0]); + + JSONObject warcHeaders = JSONUtils.extractObject(m, "Envelope.WARC-Header-Metadata"); + JSONObject httpHeaders = JSONUtils.extractObject(m, "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"); + + for (Object header : warcHeaders.keySet()) { + assertEquals(String.class, warcHeaders.get(header.toString()).getClass()); + } + + for (Object header : httpHeaders.keySet()) { + assertEquals(String.class, httpHeaders.get(header.toString()).getClass()); + } + } + + public void testMultipleHeaders() throws IOException, ResourceParseException { + MetaData m = readNextWARCResponseAsMetaData(testFilePaths[1]); + + JSONObject warcHeaders = JSONUtils.extractObject(m, "Envelope.WARC-Header-Metadata"); + JSONObject httpHeaders = JSONUtils.extractObject(m, "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers"); + + assertEquals("https://www.example.com/index.html/", warcHeaders.getString("WARC-Target-URI")); + assertEquals(JSONArray.class, warcHeaders.get("WARC-Protocol").getClass()); + assertEquals(2, ((JSONArray) warcHeaders.get("WARC-Protocol")).length()); + assertEquals("h2", ((JSONArray) warcHeaders.get("WARC-Protocol")).get(0)); + + assertEquals("108", httpHeaders.getString("Content-Length")); + assertEquals(JSONArray.class, httpHeaders.get("x-powered-by").getClass()); + assertEquals(2, 
((JSONArray) httpHeaders.get("x-powered-by")).length()); + assertEquals("PHP/8.3.11", ((JSONArray) httpHeaders.get("x-powered-by")).get(0)); + assertEquals("PleskLin", ((JSONArray) httpHeaders.get("x-powered-by")).get(1)); + } +} diff --git a/src/test/resources/org/archive/format/warc/mutliple-headers.warc b/src/test/resources/org/archive/format/warc/mutliple-headers.warc new file mode 100644 index 00000000..861f67f1 --- /dev/null +++ b/src/test/resources/org/archive/format/warc/mutliple-headers.warc @@ -0,0 +1,47 @@ +WARC/1.0 +WARC-Type: response +WARC-Date: 2024-09-27T10:47:02Z +WARC-Record-ID: +Content-Length: 971 +Content-Type: application/http; msgtype=response +WARC-Warcinfo-ID: +WARC-Concurrent-To: +WARC-IP-Address: 172.67.184.105 +WARC-Target-URI: https://www.example.com/index.html/ +WARC-Protocol: h2 +WARC-Protocol: tls/1.3 +WARC-Cipher-Suite: TLS_AES_256_GCM_SHA384 +WARC-Payload-Digest: sha1:70FB81039DCE25916E0E0CB48CF6662E3F27FFFC +WARC-Block-Digest: sha1:80573371A8271BE6B3AA26FD9DB72E9AD9F316D9 +WARC-Identified-Payload-Type: text/html + +HTTP/1.1 200 +date: Fri, 27 Sep 2024 10:47:02 GMT +content-type: text/html; charset=UTF-8 +x-powered-by: PHP/8.3.11 +x-powered-by: PleskLin +x-pingback: https://www.example.com/xmlrpc.php +link: ; rel="https://api.w.org/" +link: ; rel="alternate"; title="JSON"; type="application/json" +link: ; rel=shortlink +x-litespeed-cache: miss +vary: Accept-Encoding +x-turbo-charged-by: LiteSpeed +cf-cache-status: DYNAMIC +report-to: {"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=XXtestYY"}],"group":"cf-nel","max_age":604800} +nel: {"success_fraction":0,"report_to":"cf-nel","max_age":604800} +server: cloudflare +cf-ray: 8bf61e4afb9e7f9e-IAD +X-Crawler-content-encoding: br +alt-svc: h3=":443"; ma=86400 +Content-Length: 108 + + + + + + Test + + + +