From 219b5969756e0a8222e669ffdda449f4b6d50fd7 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 23 Feb 2023 16:16:40 -0500 Subject: [PATCH 1/3] Add -l/--log-directory option to add logs directory to WACZ This commit additionally formats a few modules with Black to prevent the linter from complaining. --- tests/fixtures/logs/wr-specs-crawl.log | 63 ++++++++++++++++++++++++++ tests/test_create_wacz.py | 23 +++++++++- wacz/main.py | 20 ++++++++ wacz/waczindexer.py | 1 - 4 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/logs/wr-specs-crawl.log diff --git a/tests/fixtures/logs/wr-specs-crawl.log b/tests/fixtures/logs/wr-specs-crawl.log new file mode 100644 index 0000000..41af690 --- /dev/null +++ b/tests/fixtures/logs/wr-specs-crawl.log @@ -0,0 +1,63 @@ +{"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]} +{"logLevel":"info","timestamp":"2023-02-23T20:29:37.197Z","context":"state","message":"Storing state in memory","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:37.572Z","context":"general","message":"Text Extraction: Disabled","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:45.587Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:45.590Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:45.591Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:45.591Z","context":"general","message":"Run behaviors 
finished","details":{"url":"https://specs.webrecorder.net/"}} +{"logLevel":"warn","timestamp":"2023-02-23T20:29:46.083Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async ElementHandle.evaluateHandle (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/JSHandle.js:94:16)\n at async internalHandler.queryOne (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/QueryHandler.js:25:30)\n at async ElementHandle.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ElementHandle.js:78:17)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)"}} +{"logLevel":"warn","timestamp":"2023-02-23T20:29:46.301Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n 
at async ElementHandle.evaluateHandle (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/JSHandle.js:94:16)\n at async internalHandler.queryOne (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/QueryHandler.js:25:30)\n at async ElementHandle.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ElementHandle.js:78:17)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)"}} +{"logLevel":"warn","timestamp":"2023-02-23T20:29:46.309Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async IsolatedWorld.document (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:186:63)\n at async IsolatedWorld.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:174:26)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)\n at async /app/node_modules/puppeteer-cluster/dist/util.js:63:24"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:46.594Z","context":"pageGraph","message":"Page graph data for successfully crawled 
page","details":{"url":"https://specs.webrecorder.net/","seedId":0,"depth":0,"started":"2023-02-23T20:29:37.646Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:46.594Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":0,"total":5,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/\",\"seedId\":0,\"depth\":0,\"started\":\"2023-02-23T20:29:37.646Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.593Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}"]}} +{"logLevel":"warn","timestamp":"2023-02-23T20:29:46.821Z","context":"general","message":"Check CF failed, ignoring","details":{"type":"exception","message":"Execution context was destroyed, most likely because of a navigation.","stack":"Error: Execution context was destroyed, most likely because of a navigation.\n at rewriteError (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:276:15)\n at process.processTicksAndRejections (node:internal/process/task_queues:95:5)\n at async ExecutionContext._ExecutionContext_evaluate (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/ExecutionContext.js:222:56)\n at async IsolatedWorld.document (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:186:63)\n at async IsolatedWorld.$ (file:///app/node_modules/puppeteer-core/lib/esm/puppeteer/common/IsolatedWorld.js:174:26)\n at async Crawler.checkCF (file:///app/crawler.js:968:14)\n at async Crawler.loadPage (file:///app/crawler.js:869:5)\n at async Crawler.default [as driver] (file:///app/defaultDriver.js:3:3)\n at async Crawler.crawlPage (file:///app/crawler.js:384:7)\n at async 
/app/node_modules/puppeteer-cluster/dist/util.js:63:24"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.104Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.108Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.110Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.111Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.600Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.606Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.607Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.608Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.675Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.678Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 
+{"logLevel":"info","timestamp":"2023-02-23T20:29:52.678Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.680Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.905Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.909Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.909Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:52.911Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.113Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/use-cases/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.593Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.115Z","context":"crawlState","message":"Crawl 
statistics","details":{"crawled":1,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/use-cases/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.593Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.611Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz-auth/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.633Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.611Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":2,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.633Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.680Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:45.613Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.680Z","context":"crawlState","message":"Crawl 
statistics","details":{"crawled":3,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:45.613Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.914Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/cdxj/latest/","seedId":0,"depth":1,"started":"2023-02-23T20:29:46.595Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:53.914Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":4,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/cdxj/latest/\",\"seedId\":0,\"depth\":1,\"started\":\"2023-02-23T20:29:46.595Z\"}","{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.380Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} 
+{"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.382Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.784Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.789Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.789Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.790Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.883Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.893Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.893Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:29:59.894Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.090Z","context":"general","message":"Running behaviors...","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} 
+{"logLevel":"info","timestamp":"2023-02-23T20:30:00.096Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.097Z","context":"behavior","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.098Z","context":"general","message":"Run behaviors finished","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.383Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/use-cases/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.119Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.383Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":5,"total":9,"pending":4,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/use-cases/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.119Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.793Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz-auth/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.612Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.794Z","context":"crawlState","message":"Crawl 
statistics","details":{"crawled":6,"total":9,"pending":3,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz-auth/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.612Z\"}","{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.896Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/wacz/1.1.1/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.681Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:00.896Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":7,"total":9,"pending":2,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/wacz/1.1.1/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.681Z\"}","{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:01.103Z","context":"pageGraph","message":"Page graph data for successfully crawled page","details":{"url":"https://specs.webrecorder.net/cdxj/0.1.0/","seedId":0,"depth":2,"started":"2023-02-23T20:29:53.915Z"}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:01.107Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":8,"total":9,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://specs.webrecorder.net/cdxj/0.1.0/\",\"seedId\":0,\"depth\":2,\"started\":\"2023-02-23T20:29:53.915Z\"}"]}} +{"logLevel":"info","timestamp":"2023-02-23T20:30:01.265Z","context":"general","message":"Waiting to ensure pending data is written to WARCs...","details":{}} 
+{"logLevel":"info","timestamp":"2023-02-23T20:30:01.277Z","context":"crawlState","message":"Crawl statistics","details":{"crawled":9,"total":9,"pending":0,"limit":{"max":0,"hit":false},"pendingPages":[]}} diff --git a/tests/test_create_wacz.py b/tests/test_create_wacz.py index 6d15e6c..172d7f1 100644 --- a/tests/test_create_wacz.py +++ b/tests/test_create_wacz.py @@ -32,6 +32,8 @@ def setUpClass(self, mock_now): os.path.join(TEST_DIR, "example-collection.warc"), "-o", os.path.join(self.tmpdir.name, "valid_example_1.wacz"), + "-l", + os.path.join(TEST_DIR, "logs"), ] ) with zipfile.ZipFile( @@ -59,6 +61,9 @@ def setUpClass(self, mock_now): self.tmpdir.name, "unzipped_wacz_1/datapackage.json", ) + self.wacz_log = os.path.join( + self.tmpdir.name, "unzipped_wacz_1/logs/wr-specs-crawl.log" + ) def test_components(self): """Check that the basic components of a wacz file exist""" @@ -117,12 +122,23 @@ def test_cdx_structure(self): 'com,example)/ 20201007212236 {"url": "http://www.example.com/", "mime": "text/html", "status": "200", "digest": "sha1:WJM2KPM4GF3QK2BISVUH2ASX64NOUY7L", "length": "1293", "offset": "845", "filename": "example-collection.warc", "recordDigest": "sha256:f78838ace891c96f7a6299e9e085b55a5aba8950a6d77f0f2e9ffe90f63255f2"}\n', ) + def test_log(self): + with open(self.wacz_log, "rb") as f: + content = f.read() + f.close() + + self.assertTrue( + content.startswith( + b'{"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]}\n', + ) + ) + def test_data_package_structure(self): """Check that the package_descriptor is valid""" f = open(self.wacz_json, "rb") json_parse = json.loads(f.read()) # Make sure it's recording the correct number of resources - self.assertEqual(len(json_parse["resources"]), 4) + self.assertEqual(len(json_parse["resources"]), 
5) # Check that the correct hash was recorded for a warc original_warc = hash_file("sha256", self.warc_file) @@ -142,6 +158,11 @@ def test_data_package_structure(self): cdx_resource = self.find_resource(json_parse["resources"], "cdx") self.assertEqual(original_wacz_index_cdx, cdx_resource["hash"]) + # Check that the correct hash was recorded for the log file + original_wacz_log = hash_file("sha256", self.wacz_log) + log_resource = self.find_resource(json_parse["resources"], "wr-specs-crawl.log") + self.assertEqual(original_wacz_log, log_resource["hash"]) + # Use frictionless validation valid = validate(self.wacz_json) self.assertTrue(valid.valid) diff --git a/wacz/main.py b/wacz/main.py index 92220c9..2704086 100644 --- a/wacz/main.py +++ b/wacz/main.py @@ -65,6 +65,13 @@ def main(args=None): help="Allows the user to specify the hash type used. Currently we allow sha256 and md5", ) + create.add_argument( + "-l", + "--log-directory", + help="Adds log files in specified directory to WACZ", + action="store", + ) + create.add_argument("--split-seeds", action="store_true") create.add_argument("--ts") @@ -247,6 +254,19 @@ def create_wacz(res): "Invalid passed page. 
We were unable to find a match for %s" % str(key) ) + if res.log_directory: + print("Writing logs...") + log_dir = os.path.abspath(res.log_directory) + for log_file in os.listdir(log_dir): + log_path = os.path.join(log_dir, log_file) + log_wacz_file = zipfile.ZipInfo.from_file( + log_path, "logs/{}".format(log_file) + ) + with wacz.open(log_wacz_file, "w") as out_fh: + with open(log_path, "rb") as in_fh: + shutil.copyfileobj(in_fh, out_fh) + path = "logs/{}".format(log_file) + if len(wacz_indexer.pages) > 0 and res.pages == None: print("Generating page index...") # generate pages/text diff --git a/wacz/waczindexer.py b/wacz/waczindexer.py index 98d2eac..25b2478 100644 --- a/wacz/waczindexer.py +++ b/wacz/waczindexer.py @@ -264,7 +264,6 @@ def check_pages_and_text(self, record): return if id_ not in self.pages: - if self.detect_pages: self.pages[id_] = {"timestamp": ts, "url": url, "title": url} else: From e0d85713fe63535132bf692b3033942b21aff694 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 23 Feb 2023 18:50:52 -0500 Subject: [PATCH 2/3] Add second log file fixture --- tests/fixtures/logs/wr-crawl.log | 17 +++++++++++++++++ tests/test_create_wacz.py | 20 ++++++++++++++++++-- 2 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 tests/fixtures/logs/wr-crawl.log diff --git a/tests/fixtures/logs/wr-crawl.log b/tests/fixtures/logs/wr-crawl.log new file mode 100644 index 0000000..8ece500 --- /dev/null +++ b/tests/fixtures/logs/wr-crawl.log @@ -0,0 +1,17 @@ +{"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page context being used with 1 worker","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:39.666Z","context":"general","message":"Set netIdleWait to 15 seconds","details":{}} 
+{"logLevel":"info","timestamp":"2023-02-23T23:44:39.666Z","context":"general","message":"Seeds","details":[{"url":"https://webrecorder.net/","include":[],"exclude":[],"scopeType":"page","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]} +{"logLevel":"info","timestamp":"2023-02-23T23:44:40.016Z","context":"state","message":"Storing state in memory","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:40.473Z","context":"general","message":"Text Extraction: Disabled","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:40.590Z","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":0,"total":1,"pending":1,"limit":{"max":0,"hit":false},"pendingPages":["{\"url\":\"https://webrecorder.net/\",\"seedId\":0,\"depth\":0,\"started\":\"2023-02-23T23:44:40.517Z\"}"]}} +{"logLevel":"error","timestamp":"2023-02-23T23:44:43.279Z","context":"general","message":"Invalid Seed \"mailto:info@webrecorder.net\" - URL must start with http:// or https://","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.286Z","context":"behavior","message":"Behaviors started","details":{"behaviorTimeout":90,"page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.287Z","context":"behavior","message":"Run Script Started","details":{"url":"https://webrecorder.net/","page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.291Z","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"Skipping autoscroll, page seems to not be responsive to scrolling events","page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.291Z","context":"behaviorScript","message":"Behavior log","details":{"state":{"segments":1},"msg":"done!","page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"behavior","message":"Run Script 
Finished","details":{"url":"https://webrecorder.net/","page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"behavior","message":"Behaviors finished","details":{"finished":1,"page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.293Z","context":"pageStatus","message":"Page finished","details":{"page":"https://webrecorder.net/"}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.358Z","context":"crawlStatus","message":"Crawl statistics","details":{"crawled":1,"total":1,"pending":0,"limit":{"max":0,"hit":false},"pendingPages":[]}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.358Z","context":"general","message":"Waiting to ensure pending data is written to WARCs...","details":{}} +{"logLevel":"info","timestamp":"2023-02-23T23:44:43.364Z","context":"general","message":"Crawl status: done","details":{}} diff --git a/tests/test_create_wacz.py b/tests/test_create_wacz.py index 172d7f1..95f9569 100644 --- a/tests/test_create_wacz.py +++ b/tests/test_create_wacz.py @@ -64,6 +64,9 @@ def setUpClass(self, mock_now): self.wacz_log = os.path.join( self.tmpdir.name, "unzipped_wacz_1/logs/wr-specs-crawl.log" ) + self.wacz_second_log = os.path.join( + self.tmpdir.name, "unzipped_wacz_1/logs/wr-crawl.log" + ) def test_components(self): """Check that the basic components of a wacz file exist""" @@ -122,16 +125,25 @@ def test_cdx_structure(self): 'com,example)/ 20201007212236 {"url": "http://www.example.com/", "mime": "text/html", "status": "200", "digest": "sha1:WJM2KPM4GF3QK2BISVUH2ASX64NOUY7L", "length": "1293", "offset": "845", "filename": "example-collection.warc", "recordDigest": "sha256:f78838ace891c96f7a6299e9e085b55a5aba8950a6d77f0f2e9ffe90f63255f2"}\n', ) - def test_log(self): + def test_logs(self): with open(self.wacz_log, "rb") as f: content = f.read() f.close() + with open(self.wacz_second_log, "rb") as f: + second_content = f.read() + f.close() + self.assertTrue( 
content.startswith( b'{"logLevel":"info","timestamp":"2023-02-23T20:29:36.908Z","context":"general","message":"Seeds","details":[{"url":"https://specs.webrecorder.net/","include":[{}],"exclude":[],"scopeType":"prefix","sitemap":false,"allowHash":false,"maxExtraHops":0,"maxDepth":99999}]}\n', ) ) + self.assertTrue( + content.startswith( + b'{"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page context being used with 1 worker","details":{}}\n' + ) + ) def test_data_package_structure(self): """Check that the package_descriptor is valid""" @@ -158,11 +170,15 @@ def test_data_package_structure(self): cdx_resource = self.find_resource(json_parse["resources"], "cdx") self.assertEqual(original_wacz_index_cdx, cdx_resource["hash"]) - # Check that the correct hash was recorded for the log file + # Check that the correct hash was recorded for the log files original_wacz_log = hash_file("sha256", self.wacz_log) log_resource = self.find_resource(json_parse["resources"], "wr-specs-crawl.log") self.assertEqual(original_wacz_log, log_resource["hash"]) + second_wacz_log = hash_file("sha256", self.wacz_second_log) + log_resource = self.find_resource(json_parse["resources"], "wr-crawl.log") + self.assertEqual(second_wacz_log, log_resource["hash"]) + # Use frictionless validation valid = validate(self.wacz_json) self.assertTrue(valid.valid) From 23e11a407e2472f766579316a457931cf0d96dfc Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 23 Feb 2023 18:54:17 -0500 Subject: [PATCH 3/3] Fix tests --- tests/test_create_wacz.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_create_wacz.py b/tests/test_create_wacz.py index 95f9569..7095c03 100644 --- a/tests/test_create_wacz.py +++ b/tests/test_create_wacz.py @@ -140,7 +140,7 @@ def test_logs(self): ) ) self.assertTrue( - content.startswith( + second_content.startswith( b'{"logLevel":"info","timestamp":"2023-02-23T23:44:39.665Z","context":"general","message":"Page 
context being used with 1 worker","details":{}}\n' ) ) @@ -150,7 +150,7 @@ def test_data_package_structure(self): f = open(self.wacz_json, "rb") json_parse = json.loads(f.read()) # Make sure it's recording the correct number of resources - self.assertEqual(len(json_parse["resources"]), 5) + self.assertEqual(len(json_parse["resources"]), 6) # Check that the correct hash was recorded for a warc original_warc = hash_file("sha256", self.warc_file)