Skip to content

Parse And Create Web ARChive (WARC) files with node.js

License

Notifications You must be signed in to change notification settings

context-labs/node-warc

 
 

Folders and files

Name
Last commit message
Last commit date

Latest commit

 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

node-warc

Parse Web Archive (WARC) files or create WARC files using chrome-remote-interface, chrome-remote-interface-extra, or Puppeteer.

Run npm install node-warc or yarn add node-warc to get started.

npm Package

Documentation

Full documentation available at n0tan3rd.github.io/node-warc

Parsing

Using async iteration

Requires node 10 or greater

const fs = require('fs')
const zlib = require('zlib')
// recordIterator only exported if async iteration on readable streams is available
const { recordIterator } = require('node-warc')

/**
 * Logs every record produced by `recordIterator` for the supplied stream.
 *
 * @param {import('stream').Readable} warcStream - readable stream handed to
 *   `recordIterator`; the call below passes an already-gunzipped file stream
 * @returns {Promise<void>} settles once the async iteration has finished
 */
async function iterateRecords (warcStream) {
  for await (const record of recordIterator(warcStream)) {
    console.log(record)
  }
}

iterateRecords(
  fs.createReadStream('<path-to-gzipd-warcfile>').pipe(zlib.createGunzip())
)
  .then(() => {
    console.log('done')
  })
  .catch(error => {
    // Without a rejection handler, a failure during iteration would surface
    // as an unhandled promise rejection instead of being reported here.
    console.error(error)
  })

Or using one of the parsers

const { AutoWARCParser } = require('node-warc')

// `for await` is only valid inside an async function (or at the top level of
// an ES module), so the loop is wrapped in an async IIFE.
;(async () => {
  for await (const record of new AutoWARCParser('<path-to-warcfile>')) {
    console.log(record)
  }
})().catch(error => {
  // Report iteration failures instead of leaving the rejection unhandled
  console.error(error)
})

Using Stream Transform

const fs = require('fs')
const { WARCStreamTransform } = require('node-warc')

// Pipe the WARC file through WARCStreamTransform and log whatever the
// transform emits on its 'data' event.
fs
  .createReadStream('<path-to-warcfile>')
  .pipe(new WARCStreamTransform())
  .on('data', record => {
    console.log(record)
  })
  .on('error', error => {
    // An 'error' event emitted by the transform with no listener attached is
    // thrown by Node.js; log it instead, as the parser examples do.
    console.error(error)
  })

Both .warc and .warc.gz

const { AutoWARCParser } = require('node-warc')

// Event-based parsing with AutoWARCParser: register the listeners first,
// then start the parser.
const parser = new AutoWARCParser('<path-to-warcfile>')

parser.on('record', function onRecord (record) {
  console.log(record)
})

parser.on('done', function onDone () {
  console.log('finished')
})

parser.on('error', function onError (error) {
  console.error(error)
})

parser.start()

Only gzip'd warc files

const { WARCGzParser } = require('node-warc')

// Event-based parsing with WARCGzParser. The listeners are kept in a lookup
// table and attached in declaration order before the parser is started.
const parser = new WARCGzParser('<path-to-warcfile>')

const listeners = {
  record: record => { console.log(record) },
  done: () => { console.log('finished') },
  error: error => { console.error(error) }
}

for (const [eventName, listener] of Object.entries(listeners)) {
  parser.on(eventName, listener)
}

parser.start()

Only non gzip'd warc files

// WARCParser is the class instantiated below, so it is the one that must be
// required (the snippet previously required WARCGzParser by mistake).
const { WARCParser } = require('node-warc')

// Event-based parsing of a non-gzipped WARC file: register the listeners,
// then start the parser.
const parser = new WARCParser('<path-to-warcfile>')
parser.on('record', record => { console.log(record) })
parser.on('done', () => { console.log('finished') })
parser.on('error', error => { console.error(error) })
parser.start()

WARC Creation

Environment

  • NODEWARC_WRITE_GZIPPED - enable writing gzipped records to WARC outputs.

Examples

Using chrome-remote-interface

const CRI = require('chrome-remote-interface')
const { RemoteChromeWARCWriter, RemoteChromeCapturer } = require('node-warc')

// Capture http://example.com using chrome-remote-interface and write the
// captured traffic to myWARC.warc.
;(async () => {
  let client
  try {
    client = await CRI()
    await Promise.all([
      client.Page.enable(),
      client.Network.enable()
    ])
    const cap = new RemoteChromeCapturer(client.Network)
    // Start capturing before navigating so the page's requests are observed
    cap.startCapturing()
    await client.Page.navigate({ url: 'http://example.com' })
    // actual code should wait for a better stopping condition, eg. network idle
    await client.Page.loadEventFired()
    const warcGen = new RemoteChromeWARCWriter()
    await warcGen.generateWARC(cap, client.Network, {
      warcOpts: {
        warcPath: 'myWARC.warc'
      },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    })
  } catch (err) {
    console.error(err)
  } finally {
    // Close the connection even when capturing or WARC generation fails
    if (client) {
      await client.close()
    }
  }
})()
// Using chrome-remote-interface-extra
const { CRIExtra, Events, Page } = require('chrome-remote-interface-extra')
const { CRIExtraWARCGenerator, CRIExtraCapturer } = require('node-warc')

// Capture https://example.com and write the captured traffic to myWARC.warc.
;(async function captureWithCRIExtra () {
  let connection
  try {
    // connect to endpoint
    connection = await CRIExtra({ host: 'localhost', port: 9222 })
    const page = await Page.create(connection)

    const capturer = new CRIExtraCapturer(page, Events.Page.Request)
    capturer.startCapturing()
    await page.goto('https://example.com', { waitUntil: 'networkIdle' })

    const generator = new CRIExtraWARCGenerator()
    const generateOptions = {
      warcOpts: { warcPath: 'myWARC.warc' },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    }
    await generator.generateWARC(capturer, generateOptions)
  } catch (error) {
    console.error(error)
  } finally {
    // Only close the connection if it was actually established
    if (connection) {
      await connection.close()
    }
  }
})()

Using Puppeteer

const puppeteer = require('puppeteer')
const { Events } = require('puppeteer')
const { PuppeteerWARCGenerator, PuppeteerCapturer } = require('node-warc')

// Capture http://example.com with Puppeteer and write the captured traffic
// to myWARC.warc.
;(async () => {
  let browser
  try {
    browser = await puppeteer.launch()
    const page = await browser.newPage()
    const cap = new PuppeteerCapturer(page, Events.Page.Request)
    // Start capturing before navigating so the page's requests are observed
    cap.startCapturing()
    await page.goto('http://example.com', { waitUntil: 'networkidle0' })
    const warcGen = new PuppeteerWARCGenerator()
    await warcGen.generateWARC(cap, {
      warcOpts: {
        warcPath: 'myWARC.warc'
      },
      winfo: {
        description: 'I created a warc!',
        isPartOf: 'My awesome pywb collection'
      }
    })
    await page.close()
  } catch (err) {
    console.error(err)
  } finally {
    // Close the browser even when navigation or WARC generation fails, so
    // the launched browser process is not left running
    if (browser) {
      await browser.close()
    }
  }
})()

Note

The generateWARC method used in the preceding examples is a helper function that simplifies the WARC generation process. See its implementation for a full example of WARC generation using node-warc.

Or see one of the crawler implementations provided by Squidwarc.

About

Parse And Create Web ARChive (WARC) files with node.js

Resources

License

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages

  • JavaScript 100.0%