node-warc

Parse Web Archive (WARC) files or create WARC files using chrome-remote-interface, chrome-remote-interface-extra, or Puppeteer.

Run `npm install node-warc` or `yarn add node-warc` to get started.

Documentation

Full documentation available at n0tan3rd.github.io/node-warc

Parsing

Using async iteration

Requires node 10 or greater

const fs = require('fs')
const zlib = require('zlib')
// recordIterator only exported if async iteration on readable streams is available
const { recordIterator } = require('node-warc')

async function iterateRecords (warcStream) {
  for await (const record of recordIterator(warcStream)) {
    console.log(record)
  }
}

iterateRecords(
  fs.createReadStream('<path-to-gzipd-warcfile>').pipe(zlib.createGunzip())
).then(() => {
  console.log('done')
})

Or iterate over one of the parsers directly (inside an async function)

for await (const record of new AutoWARCParser('<path-to-warcfile>')) {
    console.log(record)
}

Using Stream Transform

const fs = require('fs')
const { WARCStreamTransform } = require('node-warc')

fs
  .createReadStream('<path-to-warcfile>')
  .pipe(new WARCStreamTransform())
  .on('data', record => {
    console.log(record)
  })

Both `.warc` and `.warc.gz`

const { AutoWARCParser } = require('node-warc')

const parser = new AutoWARCParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()

Only gzip'd warc files

const { WARCGzParser } = require('node-warc')

const parser = new WARCGzParser('<path-to-gzipd-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()

Only non gzip'd warc files

const { WARCParser } = require('node-warc')

const parser = new WARCParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()

WARC Creation

Environment

NODEWARC_WRITE_GZIPPED - enable writing gzipped records to WARC outputs.

Examples

Using chrome-remote-interface

const CRI = require('chrome-remote-interface')
const { RemoteChromeWARCWriter, RemoteChromeCapturer } = require('node-warc')

;(async () => {
  const client = await CRI()
  await Promise.all([
    client.Page.enable(),
    client.Network.enable(),
  ])
  const cap = new RemoteChromeCapturer(client.Network)
  cap.startCapturing()
  await client.Page.navigate({ url: 'http://example.com' });
  // actual code should wait for a better stopping condition, eg. network idle
  await client.Page.loadEventFired()
  const warcGen = new RemoteChromeWARCWriter()
  await warcGen.generateWARC(cap, client.Network, {
    warcOpts: {
      warcPath: 'myWARC.warc'
    },
    winfo: {
      description: 'I created a warc!',
      isPartOf: 'My awesome pywb collection'
    }
  })
  await client.close()
})()

Using chrome-remote-interface-extra

const { CRIExtra, Events, Page } = require('chrome-remote-interface-extra')
const { CRIExtraWARCGenerator, CRIExtraCapturer } = require('node-warc')

;(async () => {
  let client
  try {
    // connect to endpoint
    client = await CRIExtra({ host: 'localhost', port: 9222 })
    const page = await Page.create(client)
    const cap = new CRIExtraCapturer(page, Events.Page.Request)
    cap.startCapturing()
    await page.goto('https://example.com', { waitUntil: 'networkIdle' })
    const warcGen = new CRIExtraWARCGenerator()
    await warcGen.generateWARC(cap, {
      warcOpts: {
        warcPath: 'myWARC.warc'
      },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    })
  } catch (err) {
    console.error(err)
  } finally {
    if (client) {
      await client.close()
    }
  }
})()

Using Puppeteer

const puppeteer = require('puppeteer')
const { Events } = require('puppeteer')
const { PuppeteerWARCGenerator, PuppeteerCapturer } = require('node-warc')

;(async () => {
  const browser = await puppeteer.launch()
  const page = await browser.newPage()
  const cap = new PuppeteerCapturer(page, Events.Page.Request)
  cap.startCapturing()
  await page.goto('http://example.com', { waitUntil: 'networkidle0' })
  const warcGen = new PuppeteerWARCGenerator()
  await warcGen.generateWARC(cap, {
    warcOpts: {
      warcPath: 'myWARC.warc'
    },
    winfo: {
      description: 'I created a warc!',
      isPartOf: 'My awesome pywb collection'
    }
  })
  await page.close()
  await browser.close()
})()

Note

The generateWARC method used in the preceding examples is a helper function that simplifies the WARC generation process. See its implementation for a full example of WARC generation using node-warc.

Or see one of the crawler implementations provided by Squidwarc.

Name		Name	Last commit message	Last commit date
Latest commit History 115 Commits
lib		lib
misc		misc
test		test
.esdoc.json		.esdoc.json
.gitignore		.gitignore
.npmignore		.npmignore
.travis.yml		.travis.yml
CHANGELOG.md		CHANGELOG.md
LICENSE		LICENSE
README.md		README.md
index.d.ts		index.d.ts
index.js		index.js
package.json		package.json
yarn.lock		yarn.lock

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

node-warc

Documentation

Parsing

Using async iteration

Using Stream Transform

Both `.warc` and `.warc.gz`

Only gzip'd warc files

Only non gzip'd warc files

WARC Creation

Environment

Examples

Using chrome-remote-interface

Using chrome-remote-interface-extra

Using Puppeteer

Note

About

Releases

Packages

Languages

License

context-labs/node-warc

Folders and files

Latest commit

History

Repository files navigation

node-warc

Documentation

Parsing

Using async iteration

Using Stream Transform

Both .warc and .warc.gz

Only gzip'd warc files

Only non gzip'd warc files

WARC Creation

Environment

Examples

Using chrome-remote-interface

Using chrome-remote-interface-extra

Using Puppeteer

Note

About

Resources

License

Stars

Watchers

Forks

Releases

Packages 0

Languages

Both `.warc` and `.warc.gz`

Packages