var xray = require('x-ray');
xray('http://github.com/stars/matthewmueller')
.select([{
$root: '.repo-list-item',
title: '.repo-list-name',
link: '.repo-list-name a[href]',
description: '.repo-list-description',
meta: {
$root: '.repo-list-meta',
starredOn: 'time'
}
}])
.paginate('.pagination a:last-child[href]')
.limit(10)
.write('out.json');
npm install x-ray
-
Flexible schema: Supports strings, arrays, arrays of objects, and nested object structures. The schema is not tied to the structure of the page you're scraping, allowing you to pull the data in the structure of your choosing.
-
Pagination support: Paginate through websites, scraping each page. X-ray also supports a request `delay` and a pagination `limit`. Scraped pages can be streamed to a file, so if there's an error on one page, you won't lose what you've already scraped.
-
Complex actions: With the PhantomJS driver, you can click on buttons, fill out forms, even login or signup before scraping the page. This allows you to scrape pages that require authentication like Facebook or Twitter.
-
Pluggable drivers: Swap in different scrapers depending on your needs. Currently supports HTTP and PhantomJS drivers. In the future, I'd like to see a Tor driver for requesting pages through the Tor network.
-
Adaptive output: Apply custom functions to format your content. This allows you to create RSS feeds or even HTML pages from your output.
Initialize `xray` with a URL:
xray('http://google.com')
The elements you'd like to select. Uses x-ray-select for matching the elements on the page.
You can specify `[attr]` to select different attributes. Here are some examples:
img[src]
a[href]
header[class]
div[data-count]
And you can use the `$root` attribute to scope the search. Here are some example selections:
xray('http://google.com')
.select('title')
.run(function(err, title) {
// title is 'Google'
});
xray('http://mat.io')
.select(['.Header-list-item a'])
.run(function(err, array) {
// array is [ 'Github', 'Twitter', 'Lapwing', 'Email' ]
});
The following will select the first item:
xray('http://mat.io')
.select({
$root: ".item",
link: 'a[href]',
thumb: 'img[src]',
content: {
$root: '.item-content',
title: 'h2',
body: 'section'
},
tags: ['.item-tags li']
})
.run(function(err, object) {
// object is the first "item":
//
// {
// link: 'http://ift.tt/1xIsboY',
// thumb: 'http://www.google.com/s2/favicons?domain=http://ift.tt/1xIsboY',
// content: {
// title: 'The 100 Best Children\'s Books of All Time',
// body: 'Relive your childhood...'
// },
// tags: [ 'twitter' ]
// }
});
It's easy to grab all the items by passing an array.
xray('http://mat.io')
.select([{
$root: ".item",
link: 'a[href]',
thumb: 'img[src]',
content: {
$root: '.item-content',
title: 'h2',
body: 'section'
},
tags: ['.item-tags li']
}])
.run(function(err, array) {
// array is all the "items":
//
// [
// {
// link: 'http://ift.tt/1xIsboY',
// thumb: 'http://www.google.com/s2/favicons?domain=http://ift.tt/1xIsboY',
// content: {
// title: 'The 100 Best Children\'s Books of All Time',
// body: 'Relive your childhood...'
// },
// tags: [ 'twitter' ]
// },
// {
// ...
// }
// ]
});
Add a plugin to augment Xray's current functionality.
Here's how to use the PhantomJS driver:
var phantom = require('x-ray-phantom');
xray('http://google.com')
.use(phantom(options))
This tells x-ray whether or not to throw if it encounters an error while parsing. Defaults to throwing (`true`).
xray('https://github.com/')
.throws(false)
Crawl the website by passing a selector that contains a URL to the next or previous page:
xray('https://github.com/')
.paginate('.next[href]')
You can just as easily go backwards:
xray('https://github.com/')
.paginate('.prev[href]')
When paginating, this will delay the next request by a random duration between `from` and `to` milliseconds.
xray('http://github.com')
.paginate('.next')
// delays grabbing the next page for 5 to 10 seconds
.delay(5000, 10000)
If you only pass `from`, it will delay exactly `from` milliseconds.
xray('http://github.com')
.paginate('.next')
// delays grabbing the next page for 5 seconds
.delay(5000)
You can prepare the data that you scrape for output:
function uppercase(str) {
return str.toUpperCase();
}
xray('mat.io')
.prepare('uppercase', uppercase)
.select('title | uppercase')
.run(function(err, title) {
// title == MAT.IO
});
You can also pass in objects:
var prepare = {
uppercase: function (str) {
return str.toUpperCase();
}
}
xray('mat.io')
.prepare(prepare)
.select('title | uppercase')
.run(function(err, title) {
// title == MAT.IO
});
Specify a custom formatting function for each selected element.
xray('https://github.com/stars/matthewmueller')
.select([{
$root: '.repo-list-item',
title: '.repo-list-name',
link: '.repo-list-name a[href]',
}])
.format(function(obj) {
return mustache('<a href="{{link}}">{{title}}</a>', obj);
})
.run(function(err, array) {
var html = array.join('<br/>');
});
TODO: specify an "end", so you can do `xray.format(html)` and get back HTML.
When paginating, this specifies a limit to the number of pages x-ray should crawl. Defaults to no limit (`Infinity`).
Start the scraper, calling `fn` when we're done scraping.
xray('http://google.com')
.select('title')
.run(function(err, title) {
// title is "Google"
});
If no `fn` is present, we can yield on `run`.
var title = yield xray('http://google.com').select('title').run();
// title is "Google"
Start the scraper, streaming each page to `filepath`. Returns a `WritableStream`.
xray('http://google.com')
.select('title')
.write('out.json')
.on('error', error)
.on('close', function() {
console.log('all done');
})
xray('http://google.com')
.select('title')
.write(process.stdout);
- Scraping is illegal!
Actually it's not. Scraping is not illegal in the same way that BitTorrent the protocol is not illegal.
It depends on how you use it. In fact, Google is basically one big scraping company. They follow the `robots.txt` file to know what they can and cannot scrape. You should make sure that you are permitted to scrape the content before scraping.
- How do you select elements?
I use the wonderful SelectorGadget Chrome Extension.
To run the tests, run:
npm install
make test
- Logo uses a modified version of XOXO's Network.
- Segment's Nightmare provides the spine for the PhantomJS driver.
(The MIT License)
Copyright (c) 2014 Matthew Mueller <matt@lapwinglabs.com>
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the 'Software'), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.