Skip to content

Commit

Permalink
more download improvements:
Browse files Browse the repository at this point in the history
- fix missing HTTP protocol version ("HTTP/1.1") in the request line of WARC request records
- condense download menu: download only selected if any pages are selected
- set content-length when removing encoding to ensure correct http content-length
- add option to download warc/1.0 warcs
ensure content-length is set to actual content-length after recording
recorder: don't detach if not running
dependency: update to replaywebpage 1.5.1 for improved download options
bump to 0.6.16
  • Loading branch information
ikreymer committed Sep 8, 2021
1 parent 055affa commit f811c65
Show file tree
Hide file tree
Showing 10 changed files with 179 additions and 160 deletions.
6 changes: 3 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "archiveweb.page",
"version": "0.6.15",
"version": "0.6.16",
"main": "index.js",
"description": "Create Web Archives directly in your browser",
"repository": "https://github.com/webrecorder/archiveweb.page",
Expand All @@ -11,7 +11,7 @@
"@webrecorder/wabac": "^2.9.0",
"browsertrix-behaviors": "^0.2.3",
"btoa": "^1.2.1",
"bulma": "^0.9.2",
"bulma": "^0.9.3",
"flexsearch": "^0.6.32",
"hash-wasm": "^4.4.1",
"http-status-codes": "^1.4.0",
Expand All @@ -22,7 +22,7 @@
"lodash": "^4.17.20",
"node-fetch": "^2.6.1",
"pretty-bytes": "^5.3.0",
"replaywebpage": "^1.5.0",
"replaywebpage": "^1.5.1",
"uuid": "^8.3.2",
"warcio": "^1.4.7"
},
Expand Down
39 changes: 22 additions & 17 deletions src/downloader.js
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ class Downloader
this.modifiedDate = coll.config.metadata.mtime ? new Date(coll.config.metadata.mtime).toISOString() : null;

this.format = format;
this.warcVersion = (format === "warc1.0") ? "WARC/1.0" : "WARC/1.1";

this.filename = filename;

Expand Down Expand Up @@ -122,6 +123,7 @@ class Downloader
return this.downloadWACZ(this.filename, sizeCallback);

case "warc":
case "warc1.0":
return this.downloadWARC(this.filename, sizeCallback);

default:
Expand Down Expand Up @@ -557,7 +559,7 @@ class Downloader
}

async createWARCInfo(filename) {
const warcVersion = "WARC/1.1";
const warcVersion = this.warcVersion;
const type = "warcinfo";

const info = {
Expand All @@ -579,21 +581,24 @@ class Downloader
return buffer;
}

removeEncodingHeaders(headersMap) {
fixupHttpHeaders(headersMap, length) {
let count = 0;
for (const [name] of Object.entries(headersMap)) {
const lowerName = name.toLowerCase();
if (lowerName === "content-encoding") {
switch (lowerName) {
case "content-encoding":
case "transfer-encoding":
delete headersMap[name];
if (++count === 2) {
break;
}
++count;
break;

case "content-length":
headersMap[name] = "" + length;
++count;
break;
}
if (lowerName === "transfer-encoding") {
delete headersMap[name];
if (++count === 2) {
break;
}
if (count === 3) {
break;
}
}
}
Expand All @@ -603,10 +608,7 @@ class Downloader
const date = new Date(resource.ts).toISOString();
resource.timestamp = getTSMillis(date);
const httpHeaders = resource.respHeaders || {};
const warcVersion = "WARC/1.1";

// remove encoding headers, as encoding is never preserved in browser-based capture
this.removeEncodingHeaders(httpHeaders);
const warcVersion = this.warcVersion;

const pageId = resource.pageId;

Expand Down Expand Up @@ -693,6 +695,9 @@ class Downloader
warcHeaders["WARC-Payload-Digest"] = resource.digest;
}

// remove encoding headers and set content-length to the payload length, as encoding is never preserved in browser-based capture
this.fixupHttpHeaders(httpHeaders, payload.length);

const record = await WARCRecord.create({
url, date, type, warcVersion, warcHeaders, statusline, httpHeaders,
refersToUrl, refersToDate}, getPayload(payload));
Expand All @@ -716,7 +721,7 @@ class Downloader
};

const urlParsed = new URL(url);
const statusline = method + " " + url.slice(urlParsed.origin.length);
const statusline = `${method} ${url.slice(urlParsed.origin.length)} HTTP/1.1`;

const reqRecord = await WARCRecord.create({
url, date, warcVersion, type,
Expand All @@ -740,7 +745,7 @@ class Downloader

const type = "resource";
const warcHeaders = {"Content-Type": "text/plain; charset=\"UTF-8\""};
const warcVersion = "WARC/1.1";
const warcVersion = this.warcVersion;

const payload = getPayload(encoder.encode(resource.text));

Expand Down
8 changes: 6 additions & 2 deletions src/recorder.js
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ class Recorder {
}

async detach() {
if (!this.running) {
return;
}

this.stopping = true;

const domNodes = await this.getFullText(true);
Expand Down Expand Up @@ -1188,8 +1192,8 @@ class Recorder {

console.log("Start Async Load: " + request.url);

const result = await this.pageEval("__awp_async_fetch__", expression, sessions);
console.log("Async Fetch Result: " + JSON.stringify(result));
await this.pageEval("__awp_async_fetch__", expression, sessions);
//console.log("Async Fetch Result: " + JSON.stringify(result));
}

async doAsyncFetch(request, sessions) {
Expand Down
32 changes: 22 additions & 10 deletions src/requestresponseinfo.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ import { postToGetUrl } from "warcio";
// max URL length for post/put payload-converted URLs
const MAX_URL_LENGTH = 4096;

const CONTENT_LENGTH = "content-length";
const CONTENT_TYPE = "content-type";
const EXCLUDE_HEADERS = ["content-encoding", "transfer-encoding"];

const encoder = new TextEncoder();
Expand Down Expand Up @@ -146,10 +148,10 @@ class RequestResponseInfo

this.ts = new Date().getTime();

const respHeaders = this.getResponseHeadersDict();
const respHeaders = this.getResponseHeadersDict(payload.length);
const reqHeaders = this.getRequestHeadersDict();

const mime = (respHeaders.headers.get("content-type") || "").split(";")[0];
const mime = (respHeaders.headers.get(CONTENT_TYPE) || "").split(";")[0];
const cookie = reqHeaders.headers.get("cookie");

if (cookie) {
Expand Down Expand Up @@ -230,19 +232,24 @@ class RequestResponseInfo
return this._getHeadersDict(this.requestHeaders, null);
}

getResponseHeadersDict() {
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList);
getResponseHeadersDict(length) {
return this._getHeadersDict(this.responseHeaders, this.responseHeadersList, length);
}

_getHeadersDict(headersDict, headersList) {
_getHeadersDict(headersDict, headersList, actualContentLength) {
if (!headersDict && headersList) {
headersDict = {};

for (const header of headersList) {
if (EXCLUDE_HEADERS.includes(header.name.toLowerCase())) {
const headerName = header.name.toLowerCase();
if (EXCLUDE_HEADERS.includes(headerName)) {
continue;
}
headersDict[header.name] = header.value.replace(/\n/g, ", ");
if (actualContentLength && headerName === CONTENT_LENGTH) {
headersDict[headerName] = "" + actualContentLength;
continue;
}
headersDict[headerName] = header.value.replace(/\n/g, ", ");
}
}

Expand All @@ -260,7 +267,12 @@ class RequestResponseInfo
delete headersDict[key];
continue;
}
if (EXCLUDE_HEADERS.includes(key.toLowerCase())) {
const keyLower = key.toLowerCase();
if (EXCLUDE_HEADERS.includes(keyLower)) {
continue;
}
if (actualContentLength && keyLower === CONTENT_LENGTH) {
headersDict[key] = "" + actualContentLength;
continue;
}
headersDict[key] = headersDict[key].replace(/\n/g, ", ");
Expand All @@ -284,8 +296,8 @@ class RequestResponseInfo
const length = this.payload.length;

const { headers } = this.getResponseHeadersDict();
const contentType = headers.get("content-type");
const contentLength = headers.get("content-length");
const contentType = headers.get(CONTENT_TYPE);
const contentLength = headers.get(CONTENT_LENGTH);

if (Number(contentLength) !== length) {
return false;
Expand Down
8 changes: 4 additions & 4 deletions wr-ext/bg.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion wr-ext/manifest.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "Webrecorder ArchiveWeb.page",
"description": "Create high-fidelity web archives directly in your browser",
"version": "0.6.15",
"version": "0.6.16",
"content_security_policy": "script-src 'self' 'unsafe-eval'; object-src 'self'",
"permissions": [
"debugger",
Expand Down
88 changes: 44 additions & 44 deletions wr-ext/popup.js

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions wr-ext/replay/sw.js

Large diffs are not rendered by default.

134 changes: 66 additions & 68 deletions wr-ext/replay/ui.js

Large diffs are not rendered by default.

18 changes: 9 additions & 9 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1997,10 +1997,10 @@ builtin-status-codes@^3.0.0:
resolved "https://registry.yarnpkg.com/builtin-status-codes/-/builtin-status-codes-3.0.0.tgz#85982878e21b98e1c66425e03d0174788f569ee8"
integrity sha1-hZgoeOIbmOHGZCXgPQF0eI9Wnug=

bulma@^0.9.2:
version "0.9.2"
resolved "https://registry.yarnpkg.com/bulma/-/bulma-0.9.2.tgz#340011e119c605f19b8ca886bfea595f1deaf23c"
integrity sha512-e14EF+3VSZ488yL/lJH0tR8mFWiEQVCMi/BQUMi2TGMBOk+zrDg4wryuwm/+dRSHJw0gMawp2tsW7X1JYUCE3A==
bulma@^0.9.3:
version "0.9.3"
resolved "https://registry.yarnpkg.com/bulma/-/bulma-0.9.3.tgz#ddccb7436ebe3e21bf47afe01d3c43a296b70243"
integrity sha512-0d7GNW1PY4ud8TWxdNcP6Cc8Bu7MxcntD/RRLGWuiw/s0a9P+XlH/6QoOIrmbj6o8WWJzJYhytiu9nFjTszk1g==

bytes@3.0.0:
version "3.0.0"
Expand Down Expand Up @@ -9688,14 +9688,14 @@ repeating@^2.0.0:
dependencies:
is-finite "^1.0.0"

replaywebpage@^1.5.0:
version "1.5.0"
resolved "https://registry.yarnpkg.com/replaywebpage/-/replaywebpage-1.5.0.tgz#e2bf21e927a12e0f253073c7da38920ed5c159d5"
integrity sha512-vGhsYWfV55M7JobbvG+O+Uf9/VXkE2nkx9lkvgb4ze53Bl2aMV3yzjsHzsCsEQ6fE/XmBlYTEbIaC090W3lv/A==
replaywebpage@^1.5.1:
version "1.5.1"
resolved "https://registry.yarnpkg.com/replaywebpage/-/replaywebpage-1.5.1.tgz#0f2cf3f3a9b485eb8b112e486c5f6fd9b9853eca"
integrity sha512-etgrBrYH+kXQSEdximKV4/UYQB+23ZXRHi0W1g2d6L/fxHaAqAHSLTzQWpiaAYvyUjGN0H2h+fhQCSOJvQSb9w==
dependencies:
"@fortawesome/fontawesome-free" "^5.13.0"
"@webrecorder/wabac" "^2.9.0-beta.1"
bulma "^0.9.2"
bulma "^0.9.3"
electron-log "^4.3.0"
electron-updater "^4.3.5"
fetch-ndjson "^1.1.0"
Expand Down

0 comments on commit f811c65

Please sign in to comment.