Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make MetaData multi-valued to preserve values of repeating WARC and HTTP headers #38

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 64 additions & 3 deletions src/main/java/org/archive/resource/MetaData.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@
import com.github.openjson.JSONObject;
import com.github.openjson.JSONTokener;

/**
* A nested structure of {@linkplain JSONObject}s to hold the metadata of
* content in nested containers, e.g. an HTML page as payload of an HTTP
* response in a WARC record stored as gzip "member".
*
* <p>MetaData is multi-valued: if a second value is added under the same "key"
* ("name"), both values are stored in a {@linkplain JSONArray} as value. This
* makes it possible to hold all values of repeating WARC or HTTP headers.
*/
public class MetaData extends JSONObject {

private static final Logger LOG =
Expand Down Expand Up @@ -67,6 +76,18 @@ public int getInt(String key) {
}
}

/**
 * Returns the value mapped to {@code key} as an int.
 *
 * @param key name of the value to look up
 * @param defaultValue value returned when {@code key} is not present or the
 *        stored value cannot be read as an int
 * @return the int value, or {@code defaultValue}; a failed read of an
 *         existing value is additionally reported through {@code LOG.severe}
 */
@Override
public int optInt(String key, int defaultValue) {
    if (!has(key)) {
        return defaultValue;
    }
    try {
        return super.getInt(key);
    } catch (JSONException conversionError) {
        // the key exists but its value is not readable as an int
        LOG.severe(conversionError.getMessage());
        return defaultValue;
    }
}

@Override
public long getLong(String key) {
try {
Expand All @@ -77,6 +98,18 @@ public long getLong(String key) {
}
}

/**
 * Returns the value mapped to {@code key} as a long.
 *
 * @param key name of the value to look up
 * @param defaultValue value returned when {@code key} is not present or the
 *        stored value cannot be read as a long
 * @return the long value, or {@code defaultValue}; a failed read of an
 *         existing value is additionally reported through {@code LOG.severe}
 */
@Override
public long optLong(String key, long defaultValue) {
    if (!has(key)) {
        return defaultValue;
    }
    try {
        return super.getLong(key);
    } catch (JSONException conversionError) {
        // the key exists but its value is not readable as a long
        LOG.severe(conversionError.getMessage());
        return defaultValue;
    }
}

@Override
public String getString(String key) {
try {
Expand All @@ -102,9 +135,37 @@ public void setTopMetaData(MetaData topMetaData) {
this.topMetaData = topMetaData;
}

/**
 * Adds a boolean under {@code name} by handing the boxed value to the
 * parent's {@code accumulate}, so that repeated names keep all their values.
 *
 * @param name key the value is stored under
 * @param value boolean to add
 * @return the result of {@code accumulate}
 * @throws JSONException if {@code accumulate} rejects the name or value
 */
@Override
public JSONObject put(String name, boolean value) throws JSONException {
    final Boolean boxed = Boolean.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Adds a double under {@code name} by handing the boxed value to the
 * parent's {@code accumulate}, so that repeated names keep all their values.
 *
 * @param name key the value is stored under
 * @param value double to add
 * @return the result of {@code accumulate}
 * @throws JSONException if {@code accumulate} rejects the name or value
 */
@Override
public JSONObject put(String name, double value) throws JSONException {
    final Double boxed = Double.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Adds an int under {@code name} by handing the boxed value to the
 * parent's {@code accumulate}, so that repeated names keep all their values.
 *
 * @param name key the value is stored under
 * @param value int to add
 * @return the result of {@code accumulate}
 * @throws JSONException if {@code accumulate} rejects the name or value
 */
@Override
public JSONObject put(String name, int value) throws JSONException {
    final Integer boxed = Integer.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Adds a long under {@code name} by handing the boxed value to the
 * parent's {@code accumulate}, so that repeated names keep all their values.
 *
 * @param name key the value is stored under
 * @param value long to add
 * @return the result of {@code accumulate}
 * @throws JSONException if {@code accumulate} rejects the name or value
 */
@Override
public JSONObject put(String name, long value) throws JSONException {
    final Long boxed = Long.valueOf(value);
    return super.accumulate(name, boxed);
}

/**
 * Stores {@code value} under {@code key}. A key that is already present gets
 * the value added through the parent's {@code accumulate}; a new key is
 * stored through the parent's plain {@code put}.
 *
 * NOTE(review): a {@code null} value for an existing key is also routed to
 * {@code accumulate} rather than to {@code put} - confirm this is intended.
 *
 * @param key name the value is stored under
 * @param value value to store or to add to the existing value(s)
 * @return the result of the parent call that was used
 */
@Override
public JSONObject put(String key, Object value) {
    // the explicit has() check decides which parent method handles the value
    return has(key) ? super.accumulate(key, value) : super.put(key, value);
}

public JSONObject putString(String key, String val) {
try {
return super.put(key,val);
return super.accumulate(key,val);
} catch(JSONException e) {
LOG.severe(e.getMessage());
return null;
Expand All @@ -113,7 +174,7 @@ public JSONObject putString(String key, String val) {

public JSONObject putLong(String key, long val) {
try {
return super.put(key,String.valueOf(val));
return super.accumulate(key,String.valueOf(val));
} catch(JSONException e) {
LOG.severe(e.getMessage());
return null;
Expand All @@ -122,7 +183,7 @@ public JSONObject putLong(String key, long val) {

public JSONObject putBoolean(String key, boolean val) {
try {
return super.put(key,val);
return super.accumulate(key,val);
} catch(JSONException e) {
LOG.severe(e.getMessage());
return null;
Expand Down
186 changes: 186 additions & 0 deletions src/test/java/org/archive/resource/MetaDataTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package org.archive.resource;

import java.io.IOException;

import org.archive.extract.ExtractingResourceFactoryMapper;
import org.archive.extract.ExtractingResourceProducer;
import org.archive.extract.ProducerUtils;
import org.archive.extract.ResourceFactoryMapper;
import org.archive.format.json.JSONUtils;

import com.github.openjson.JSONArray;
import com.github.openjson.JSONObject;

import junit.framework.TestCase;

public class MetaDataTest extends TestCase {

// WARC files read by the header tests: index 0 is the legacy file used by
// testSingleHeaders, index 1 is the file with repeated WARC and HTTP headers
// used by testMultipleHeaders. The spelling "mutliple-headers.warc" is the
// name of the resource file and must be kept as is.
private static String[] testFilePaths = {
"src/test/resources/org/archive/format/warc/IAH-urls-wget.warc",
"src/test/resources/org/archive/format/warc/mutliple-headers.warc"
};

// Sample JSON object that putMetaData stores as an object-typed value.
private static JSONObject obj = new JSONObject("{\"foo\":\"bar\",\"hello\":\"world\"}");

/**
 * Fills the given MetaData with boolean, double, int, long, String and
 * object values. For every type the "-1" key (and "obj-1") is written twice
 * so that it ends up multi-valued, while the other keys are written once.
 * The values are checked by verifyMultiValuedMetaData.
 *
 * @param m the MetaData to fill
 * @return the same instance that was passed in
 */
private MetaData putMetaData(MetaData m) {
// booleans: mix of the typed putBoolean helper and the put overload
m.putBoolean("boolean-1", false);
m.putBoolean("boolean-2", true);
m.put("boolean-3", true);
m.put("boolean-1", true); // append

// doubles
m.put("double-1", 0.5d);
m.put("double-2", 2.5d);
m.put("double-3", 3.5d);
m.put("double-1", 1.5d); // append

// ints
m.put("int-1", 0);
m.put("int-2", 2);
m.put("int-3", 3);
m.put("int-1", 1); // append

// choose JSON "numbers" which are forced into a Java long (too big for an integer)
m.putLong("long-1", 0xffffffffL + 0L);
m.putLong("long-2", 0xffffffffL + 2L);
m.put("long-3", 0xffffffffL + 3L);
m.put("long-1", 0xffffffffL + 1L); // append

// strings: mix of the typed putString helper and the put overload
m.putString("string-1", "0");
m.putString("string-2", "2");
m.put("string-3", "3");
m.put("string-1", "1"); // append

// objects: "obj-1" receives the sample object twice, "obj-2" once
m.putOpt("obj-1", obj);
m.put("obj-1", obj); // append
m.put("obj-2", obj);
m.putOpt("obj-2", null); // do nothing because value is null

return m;
}

/**
 * Checks the content written by putMetaData: keys written twice must hold a
 * {@linkplain JSONArray} with both values in insertion order, keys written
 * once must hold a single value, and the opt* getters must fall back to the
 * given default for keys that were never written.
 *
 * @param m the MetaData to check
 */
private void verifyMultiValuedMetaData(MetaData m) {
    // smallest value used for the "long" keys, just beyond the int range
    final long base = 0xffffffffL;

    // boolean
    assertEquals(JSONArray.class, m.get("boolean-1").getClass());
    JSONArray booleans = (JSONArray) m.get("boolean-1");
    assertEquals(false, booleans.getBoolean(0));
    assertEquals(true, booleans.getBoolean(1));
    assertEquals(true, m.getBoolean("boolean-2"));
    assertEquals(true, m.getBoolean("boolean-3"));
    assertEquals(Boolean.class, m.get("boolean-3").getClass());
    assertEquals(true, m.optBoolean("boolean-3", false));
    assertEquals(false, m.optBoolean("boolean-99", false));

    // double
    assertEquals(JSONArray.class, m.get("double-1").getClass());
    JSONArray doubles = (JSONArray) m.get("double-1");
    assertEquals(0.5d, doubles.getDouble(0));
    assertEquals(1.5d, doubles.getDouble(1));
    assertEquals(2.5d, m.getDouble("double-2"));
    assertEquals(3.5d, m.getDouble("double-3"));
    assertEquals(Double.class, m.get("double-3").getClass());
    assertEquals(3.5d, m.optDouble("double-3"));
    assertEquals(99.5d, m.optDouble("double-99", 99.5d));

    // int
    assertEquals(JSONArray.class, m.get("int-1").getClass());
    JSONArray ints = (JSONArray) m.get("int-1");
    assertEquals(0, ints.getInt(0));
    assertEquals(1, ints.getInt(1));
    assertEquals(2, m.getInt("int-2"));
    assertEquals(3, m.getInt("int-3"));
    assertEquals(Integer.class, m.get("int-3").getClass());
    assertEquals(3, m.optInt("int-3"));
    assertEquals(99, m.optInt("int-99", 99));

    // long
    assertEquals(JSONArray.class, m.get("long-1").getClass());
    JSONArray longs = (JSONArray) m.get("long-1");
    assertEquals(base + 0L, longs.getLong(0));
    assertEquals(base + 1L, longs.getLong(1));
    assertEquals(base + 2L, m.getLong("long-2"));
    assertEquals(base + 3L, m.getLong("long-3"));
    assertEquals(Long.class, m.get("long-3").getClass());
    assertEquals(base + 3L, m.optLong("long-3"));
    assertEquals(base + 99L, m.optLong("long-99", base + 99L));

    // String
    assertEquals(JSONArray.class, m.get("string-1").getClass());
    JSONArray strings = (JSONArray) m.get("string-1");
    assertEquals("0", strings.getString(0));
    assertEquals("1", strings.getString(1));
    assertEquals("2", m.getString("string-2"));
    assertEquals("3", m.getString("string-3"));
    assertEquals(String.class, m.get("string-3").getClass());
    assertEquals("3", m.optString("string-3"));
    assertEquals("99", m.optString("string-99", "99"));

    // Object
    assertEquals(JSONArray.class, m.get("obj-1").getClass());
    JSONArray objects = (JSONArray) m.get("obj-1");
    assertEquals(JSONObject.class, objects.get(0).getClass());
    assertEquals(JSONObject.class, objects.get(1).getClass());
    JSONObject firstObject = (JSONObject) objects.get(0);
    assertEquals("bar", firstObject.get("foo"));
    assertEquals("world", firstObject.get("hello"));
    JSONObject secondObject = (JSONObject) objects.get(1);
    assertEquals("bar", secondObject.get("foo"));
    assertEquals("world", secondObject.get("hello"));
    assertEquals(JSONObject.class, m.get("obj-2").getClass());
    JSONObject singleObject = (JSONObject) m.get("obj-2");
    assertEquals("bar", singleObject.get("foo"));
    assertEquals("world", singleObject.get("hello"));
}

/**
 * Verifies multi-valued storage on a freshly filled MetaData and again
 * after the object has been serialized to JSON text and parsed back.
 */
public void testMultiValued() {
    MetaData filled = putMetaData(new MetaData());
    verifyMultiValuedMetaData(filled);

    // test (de)serialization: write indented JSON, then parse it again
    String json = filled.toString(2);
    MetaData restored = new MetaData(json);
    verifyMultiValuedMetaData(restored);
}

/**
 * Reads resources from the given file until one is found whose WARC header
 * metadata has a "WARC-Type" of "response".
 *
 * @param filePath path of the file to read
 * @return the top-level MetaData of the first response record, or
 *         {@code null} if the file contains no such record
 */
private MetaData readNextWARCResponseAsMetaData(String filePath) throws IOException, ResourceParseException {
    ResourceProducer source = ProducerUtils.getProducer(filePath);
    ResourceFactoryMapper factoryMapper = new ExtractingResourceFactoryMapper();
    ExtractingResourceProducer extractor = new ExtractingResourceProducer(source, factoryMapper);

    for (Resource resource = extractor.getNext(); resource != null; resource = extractor.getNext()) {
        MetaData top = resource.getMetaData().getTopMetaData();
        JSONObject warcHeaders = JSONUtils.extractObject(top, "Envelope.WARC-Header-Metadata");
        boolean isResponse = warcHeaders.has("WARC-Type")
                && "response".equals(warcHeaders.getString("WARC-Type"));
        if (isResponse) {
            return top;
        }
    }
    // no response record in this file
    return null;
}

/**
 * Verify that in the legacy test file all WARC and HTTP headers are
 * single-valued, i.e. {@linkplain String}s.
 *
 * Fails with an explicit message, instead of a NullPointerException, if the
 * file yields no response record or one of the header objects is missing.
 */
public void testSingleHeaders() throws IOException, ResourceParseException {
    MetaData m = readNextWARCResponseAsMetaData(testFilePaths[0]);
    // the reader returns null when the file has no response record
    assertNotNull("no WARC response record found in " + testFilePaths[0], m);

    JSONObject warcHeaders = JSONUtils.extractObject(m, "Envelope.WARC-Header-Metadata");
    JSONObject httpHeaders = JSONUtils.extractObject(m, "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers");
    assertNotNull("WARC headers missing", warcHeaders);
    assertNotNull("HTTP headers missing", httpHeaders);

    for (Object header : warcHeaders.keySet()) {
        assertEquals(String.class, warcHeaders.get(header.toString()).getClass());
    }

    for (Object header : httpHeaders.keySet()) {
        assertEquals(String.class, httpHeaders.get(header.toString()).getClass());
    }
}

/**
 * Verify that repeated WARC and HTTP headers of the second test file are
 * stored as {@linkplain JSONArray}s holding every value, while headers
 * occurring once remain plain {@linkplain String}s.
 *
 * Fails with an explicit message, instead of a NullPointerException, if the
 * file yields no response record or one of the header objects is missing.
 */
public void testMultipleHeaders() throws IOException, ResourceParseException {
    MetaData m = readNextWARCResponseAsMetaData(testFilePaths[1]);
    // the reader returns null when the file has no response record
    assertNotNull("no WARC response record found in " + testFilePaths[1], m);

    JSONObject warcHeaders = JSONUtils.extractObject(m, "Envelope.WARC-Header-Metadata");
    JSONObject httpHeaders = JSONUtils.extractObject(m, "Envelope.Payload-Metadata.HTTP-Response-Metadata.Headers");
    assertNotNull("WARC headers missing", warcHeaders);
    assertNotNull("HTTP headers missing", httpHeaders);

    // WARC headers: "WARC-Protocol" occurs twice in the record
    assertEquals("https://www.example.com/index.html/", warcHeaders.getString("WARC-Target-URI"));
    assertEquals(JSONArray.class, warcHeaders.get("WARC-Protocol").getClass());
    assertEquals(2, ((JSONArray) warcHeaders.get("WARC-Protocol")).length());
    assertEquals("h2", ((JSONArray) warcHeaders.get("WARC-Protocol")).get(0));

    // HTTP headers: "x-powered-by" occurs twice in the response
    assertEquals("108", httpHeaders.getString("Content-Length"));
    assertEquals(JSONArray.class, httpHeaders.get("x-powered-by").getClass());
    assertEquals(2, ((JSONArray) httpHeaders.get("x-powered-by")).length());
    assertEquals("PHP/8.3.11", ((JSONArray) httpHeaders.get("x-powered-by")).get(0));
    assertEquals("PleskLin", ((JSONArray) httpHeaders.get("x-powered-by")).get(1));
}
}
47 changes: 47 additions & 0 deletions src/test/resources/org/archive/format/warc/mutliple-headers.warc
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
WARC/1.0
WARC-Type: response
WARC-Date: 2024-09-27T10:47:02Z
WARC-Record-ID: <urn:uuid:7a10b628-4d3b-6f2e-8b73-c65d80646310>
Content-Length: 971
Content-Type: application/http; msgtype=response
WARC-Warcinfo-ID: <urn:uuid:824d10d3-4f67-131a-9cbf-e40ecb5f0fa5>
WARC-Concurrent-To: <urn:uuid:51776b84-429e-53cb-a335-b53cf855c57a>
WARC-IP-Address: 172.67.184.105
WARC-Target-URI: https://www.example.com/index.html/
WARC-Protocol: h2
WARC-Protocol: tls/1.3
WARC-Cipher-Suite: TLS_AES_256_GCM_SHA384
WARC-Payload-Digest: sha1:70FB81039DCE25916E0E0CB48CF6662E3F27FFFC
WARC-Block-Digest: sha1:80573371A8271BE6B3AA26FD9DB72E9AD9F316D9
WARC-Identified-Payload-Type: text/html

HTTP/1.1 200
date: Fri, 27 Sep 2024 10:47:02 GMT
content-type: text/html; charset=UTF-8
x-powered-by: PHP/8.3.11
x-powered-by: PleskLin
x-pingback: https://www.example.com/xmlrpc.php
link: <https://www.example.com/wp-json/>; rel="https://api.w.org/"
link: <https://www.example.com/wp-json/wp/v2/posts/00000>; rel="alternate"; title="JSON"; type="application/json"
link: <https://www.example.com/?p=00000>; rel=shortlink
x-litespeed-cache: miss
vary: Accept-Encoding
x-turbo-charged-by: LiteSpeed
cf-cache-status: DYNAMIC
report-to: {"endpoints":[{"url":"https:\/\/a.nel.cloudflare.com\/report\/v4?s=XXtestYY"}],"group":"cf-nel","max_age":604800}
nel: {"success_fraction":0,"report_to":"cf-nel","max_age":604800}
server: cloudflare
cf-ray: 8bf61e4afb9e7f9e-IAD
X-Crawler-content-encoding: br
alt-svc: h3=":443"; ma=86400
Content-Length: 108

<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Test</title>
</head>
<body/>
</html>

Loading