Skip to content

Commit

Permalink
Use parse5 as a default parser (closes #863)
Browse files Browse the repository at this point in the history
  • Loading branch information
inikulin authored and fb55 committed Mar 9, 2018
1 parent 89e7d1c commit c1b944e
Show file tree
Hide file tree
Showing 8 changed files with 154 additions and 32 deletions.
7 changes: 4 additions & 3 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ __ϟ Blazingly fast:__
Cheerio works with a very simple, consistent DOM model. As a result parsing, manipulating, and rendering are incredibly efficient. Preliminary end-to-end benchmarks suggest that cheerio is about __8x__ faster than JSDOM.

__❁ Incredibly flexible:__
Cheerio wraps around @FB55's forgiving [htmlparser2](https://github.com/fb55/htmlparser2/). Cheerio can parse nearly any HTML or XML document.
Cheerio wraps around the [parse5](https://github.com/inikulin/parse5) parser and can optionally use @FB55's forgiving [htmlparser2](https://github.com/fb55/htmlparser2/). Cheerio can parse nearly any HTML or XML document.

## Cheerio is not a web browser

Expand Down Expand Up @@ -269,14 +269,15 @@ const $ = cheerio.load('<ul id="fruits">...</ul>', {
});
```

These parsing options are taken directly from [htmlparser2](https://github.com/fb55/htmlparser2/wiki/Parser-options), therefore any options that can be used in `htmlparser2` are valid in cheerio as well. The default options are:
These parsing options are taken directly from [htmlparser2](https://github.com/fb55/htmlparser2/wiki/Parser-options), therefore any options that can be used in `htmlparser2` are valid in cheerio as well. If any of these options is set to a non-default value, cheerio will implicitly use `htmlparser2` as the underlying parser. In addition, you can use the `useHtmlParser2` option to force cheerio to use `htmlparser2` instead of `parse5`. The default options are:

```js
{
withDomLvl1: true,
normalizeWhitespace: false,
xmlMode: false,
decodeEntities: true
decodeEntities: true,
useHtmlParser2: false
}

```
Expand Down
6 changes: 3 additions & 3 deletions lib/api/attributes.js
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ exports.attr = function(name, value) {
var getProp = function (el, name) {
if (!el || !isTag(el)) return;

return el.hasOwnProperty(name)
return hasOwn.call(el, name)
? el[name]
: rboolean.test(name)
? getAttr(el, name) !== undefined
Expand Down Expand Up @@ -270,7 +270,7 @@ exports.val = function(value) {
returnValue;
if (option === undefined) return undefined;
if (!querying) {
if (!this.attr().hasOwnProperty('multiple') && typeof value == 'object') {
if (!hasOwn.call(this.attr(), 'multiple') && typeof value == 'object') {
return this;
}
if (typeof value != 'object') {
Expand All @@ -283,7 +283,7 @@ exports.val = function(value) {
return this;
}
returnValue = option.attr('value');
if (this.attr().hasOwnProperty('multiple')) {
if (hasOwn.call(this.attr(), 'multiple')) {
returnValue = [];
domEach(option, function(__, el) {
returnValue.push(getAttr(el, 'value'));
Expand Down
14 changes: 2 additions & 12 deletions lib/cheerio.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

var parse = require('./parse'),
defaultOptions = require('./defaults'),
isHtml = require('./utils').isHtml,
_ = {
extend: require('lodash/assignIn'),
Expand Down Expand Up @@ -30,7 +31,7 @@ var api = [
var Cheerio = module.exports = function(selector, context, root, options) {
if (!(this instanceof Cheerio)) return new Cheerio(selector, context, root, options);

this.options = _.defaults(options || {}, this.options);
this.options = _.defaults(options, this.options, defaultOptions);

// $(), $(null), $(undefined), $(false)
if (!selector) return this;
Expand Down Expand Up @@ -98,17 +99,6 @@ _.extend(Cheerio, require('./static'));

Cheerio.prototype.cheerio = '[cheerio object]';

/*
* Cheerio default options
*/

Cheerio.prototype.options = {
withDomLvl1: true,
normalizeWhitespace: false,
xmlMode: false,
decodeEntities: true
};

/*
* Make cheerio an array-like object
*/
Expand Down
11 changes: 11 additions & 0 deletions lib/defaults.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
/*
* Cheerio default options
*
* Shared defaults that callers merge underneath user-supplied options
* (lib/cheerio.js and lib/static.js do this with `_.defaults`).
*
* lib/parse.js deep-compares the effective options against this object
* (`isEqual(options, defaultOptions)`): parse5 is used only when they match
* exactly; any differing value makes cheerio parse with htmlparser2 instead.
*/

module.exports = {
// The first four entries are htmlparser2 parser options, listed here with
// the values cheerio treats as "default" (see the htmlparser2 option docs
// for their exact meaning).
withDomLvl1: true,
normalizeWhitespace: false,
xmlMode: false,
decodeEntities: true,
// Cheerio-specific switch: set to true to force htmlparser2 over parse5.
useHtmlParser2: false
};
24 changes: 20 additions & 4 deletions lib/parse.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
/*
Module Dependencies
*/
var htmlparser = require('htmlparser2');
var htmlparser = require('htmlparser2'),
parse5 = require('parse5'),
defaultOptions = require('./defaults'),
isEqual = require('lodash/isEqual');

/*
Parser
Expand All @@ -12,20 +15,33 @@ exports = module.exports = function(content, options) {
root = exports.evaluate('<root></root>', options)[0];

root.type = 'root';
root.parent = null;

// Update the dom using the root
exports.update(dom, root);

return root;
};

/*
  Detects markup that must be parsed as a complete document rather than as a
  fragment: optional leading whitespace/comments followed by a <!doctype ...>,
  <html>, <head> or <body> start tag.

  - The tag name must be followed by whitespace, "/" or ">" so that look-alike
    elements such as <header>, <heading> or <html-view> are NOT mistaken for
    document-level tags.
  - Comments and tags may span several lines ([\s\S] / [^>] instead of ".").
  - A comment body can never run past its first "-->" (negative lookahead),
    which keeps matching linear; a lazy ".*?" inside the repeated group
    backtracks exponentially on inputs with many leading comments.
*/
var DOCUMENT_START_RE = /^(?:\s|<!--(?:(?!-->)[\s\S])*-->)*<(?:!doctype[^>]*|(?:html|head|body)(?:[\s\/][^>]*)?)>/i;

/*
  Parses `content` with parse5 using the htmlparser2 tree adapter.

  @param {string} content - markup to parse
  @returns {Array} the children of the parse5 root node (document children
    when the markup looks like a full document, fragment children otherwise)
*/
function parseWithParse5 (content) {
  var parseAsDocument = DOCUMENT_START_RE.test(content),
      parse = parseAsDocument ? parse5.parse : parse5.parseFragment,
      root = parse(content, { treeAdapter: parse5.treeAdapters.htmlparser2 });

  return root.children;
}

exports.evaluate = function(content, options) {
// options = options || $.fn.options;

var dom;
var dom,
useParse5 = isEqual(options, defaultOptions);

if (Buffer.isBuffer(content))
content = content.toString();

if (typeof content === 'string' || Buffer.isBuffer(content)) {
dom = htmlparser.parseDOM(content, options);
if (typeof content === 'string') {
dom = useParse5 ? parseWithParse5(content) : htmlparser.parseDOM(content, options);
} else {
dom = content;
}
Expand Down
7 changes: 3 additions & 4 deletions lib/static.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

var serialize = require('dom-serializer'),
defaultOptions = require('./defaults'),
select = require('css-select'),
parse = require('./parse'),
_ = {
Expand All @@ -17,7 +18,7 @@ var serialize = require('dom-serializer'),
exports.load = function(content, options) {
var Cheerio = require('./cheerio');

options = _.defaults(options || {}, Cheerio.prototype.options);
options = _.defaults(options || {}, defaultOptions);

var root = parse(content, options);

Expand Down Expand Up @@ -75,8 +76,6 @@ function render(that, dom, options) {
*/

exports.html = function(dom, options) {
var Cheerio = require('./cheerio');

// be flexible about parameters, sometimes we call html(),
// with options as only parameter
// check dom argument for dom element specific properties
Expand All @@ -89,7 +88,7 @@ exports.html = function(dom, options) {

// sometimes $.html() used without preloading html
// so fallback non existing options to the default ones
options = _.defaults(options || {}, this._options, Cheerio.prototype.options);
options = _.defaults(options || {}, this._options, defaultOptions);

return render(this, dom, options);
};
Expand Down
5 changes: 3 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,16 @@
"dom-serializer": "~0.1.0",
"entities": "~1.1.1",
"htmlparser2": "^3.9.1",
"lodash": "^4.15.0"
"lodash": "^4.15.0",
"parse5": "^3.0.1"
},
"devDependencies": {
"benchmark": "^2.1.0",
"coveralls": "^2.11.9",
"expect.js": "~0.3.1",
"istanbul": "^0.4.3",
"jsdom": "^9.2.1",
"jquery": "^3.0.0",
"jsdom": "^9.2.1",
"jshint": "^2.9.2",
"mocha": "^3.1.2",
"xyz": "~1.1.0"
Expand Down
112 changes: 108 additions & 4 deletions test/parse.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
var expect = require('expect.js'),
parse = require('../lib/parse'),
defaultOpts = require('..').prototype.options;
defaults = require('lodash/defaults'),
defaultOpts = require('../lib/defaults');


// Tags
Expand Down Expand Up @@ -46,7 +47,7 @@ describe('parse', function() {
var tag = parse.evaluate(basic, defaultOpts)[0];
expect(tag.type).to.equal('tag');
expect(tag.tagName).to.equal('html');
expect(tag.childNodes).to.be.empty();
expect(tag.childNodes).to.have.length(2);
});

it('should handle sibling tags: ' + siblings, function() {
Expand Down Expand Up @@ -78,7 +79,15 @@ describe('parse', function() {
expect(tag.type).to.equal('tag');
expect(tag.tagName).to.equal('html');
expect(tag.childNodes).to.be.ok();
expect(tag.childNodes[1].tagName).to.equal('body');
expect(tag.childNodes[1].childNodes).to.have.length(1);

tag = parse.evaluate(children, defaults({ useHtmlParser2: true }, defaultOpts))[0];
expect(tag.type).to.equal('tag');
expect(tag.tagName).to.equal('html');
expect(tag.childNodes).to.be.ok();
expect(tag.childNodes).to.have.length(1);
expect(tag.childNodes[0].tagName).to.equal('br');
});

it('should handle tags with children: ' + li, function() {
Expand Down Expand Up @@ -141,7 +150,7 @@ describe('parse', function() {
it('should handle directives: ' + directive, function() {
var elem = parse.evaluate(directive, defaultOpts)[0];
expect(elem.type).to.equal('directive');
expect(elem.data).to.equal('!doctype html');
expect(elem.data).to.equal('!DOCTYPE html');
expect(elem.tagName).to.equal('!doctype');
});

Expand Down Expand Up @@ -209,7 +218,7 @@ describe('parse', function() {
it('should add root to: ' + directive, function() {
var root = parse(directive, defaultOpts);
rootTest(root);
expect(root.childNodes).to.have.length(1);
expect(root.childNodes).to.have.length(2);
expect(root.childNodes[0].type).to.equal('directive');
});

Expand Down Expand Up @@ -247,6 +256,101 @@ describe('parse', function() {
expect(childNodes[2].firstChild).to.be(null);
expect(childNodes[2].lastChild).to.be(null);
});

// A literal "<=" between two elements is not a tag opener and must be kept
// as a text node. (Title fixed: the word "sign" was duplicated.)
it('Should parse less than or equal sign', function() {
  var root = parse('<i>A</i><=<i>B</i>', defaultOpts);
  var childNodes = root.childNodes;

  expect(childNodes[0].tagName).to.be('i');
  expect(childNodes[0].childNodes[0].data).to.be('A');
  expect(childNodes[1].data).to.be('<=');
  expect(childNodes[2].tagName).to.be('i');
  expect(childNodes[2].childNodes[0].data).to.be('B');
});

// An unterminated "<![CDATA[" inside <script> is plain script text and must
// not swallow the elements that follow the script.
it('Should ignore unclosed CDATA', function() {
  var nodes = parse('<a></a><script>foo //<![CDATA[ bar</script><b></b>', defaultOpts).childNodes,
      script = nodes[1];

  expect(nodes[0].tagName).to.be('a');
  expect(script.tagName).to.be('script');
  expect(script.childNodes[0].data).to.be('foo //<![CDATA[ bar');
  expect(nodes[2].tagName).to.be('b');
});

// A bare <html> document is expected to come back with <head> as the first
// child of <html>.
it('Should add <head> to documents', function() {
  var htmlNode = parse('<html></html>', defaultOpts).childNodes[0];

  expect(htmlNode.tagName).to.be('html');
  expect(htmlNode.childNodes[0].tagName).to.be('head');
});

// A <td> placed directly inside <table> (with a stray </tr>) is expected to be
// wrapped in <tbody> and <tr>. (Title fixed: it was a copy of the preceding
// "<head>" test, which made the two cases indistinguishable in reports.)
it('Should add implied <tbody> and <tr> to tables', function() {
  var root = parse('<table><td>bar</td></tr></table>', defaultOpts);
  var childNodes = root.childNodes;

  expect(childNodes[0].tagName).to.be('table');
  expect(childNodes[0].childNodes.length).to.be(1);
  expect(childNodes[0].childNodes[0].tagName).to.be('tbody');
  expect(childNodes[0].childNodes[0].childNodes[0].tagName).to.be('tr');
  expect(childNodes[0].childNodes[0].childNodes[0].childNodes[0].tagName).to.be('td');
  expect(childNodes[0].childNodes[0].childNodes[0].childNodes[0].childNodes[0].data).to.be('bar');
});

// <line> is not a standard HTML element; it should still come back as a
// single element wrapping its text.
it('Should parse custom tag <line>', function() {
  var nodes = parse('<line>test</line>', defaultOpts).childNodes;

  expect(nodes.length).to.be(1);

  var lineNode = nodes[0];
  expect(lineNode.tagName).to.be('line');
  expect(lineNode.childNodes[0].data).to.be('test');
});

// Stray </td> end tags must not merge or drop rows: three <tr> inputs yield
// three <tr> nodes, each holding one <td> with text i1, i2, i3.
it('Should properly parse misnested table tags', function() {
  var rows = parse('<tr><td>i1</td></tr><tr><td>i2</td></td></tr><tr><td>i3</td></td></tr>', defaultOpts).childNodes,
      idx,
      cell;

  expect(rows.length).to.be(3);

  for (idx = 0; idx < rows.length; idx++) {
    expect(rows[idx].tagName).to.be('tr');
    cell = rows[idx].childNodes[0];
    expect(cell.tagName).to.be('td');
    expect(cell.childNodes[0].data).to.be('i' + (idx + 1));
  }
});

// The style attribute mixes double quotes, semicolons and a data: URL; its
// value must be returned unchanged.
it('Should correctly parse data url attributes', function() {
  var expectedStyle = 'font-family:"butcherman-caps"; src:url(data:font/opentype;base64,AAEA...);',
      markup = '<div style=\'font-family:"butcherman-caps"; src:url(data:font/opentype;base64,AAEA...);\'></div>',
      div = parse(markup, defaultOpts).childNodes[0];

  expect(div.attribs.style).to.be(expectedStyle);
});

// Markup inside <xmp> is expected to come back as a text node, not as a
// nested <h2> element.
it('Should treat <xmp> tag content as text', function() {
  var xmp = parse('<xmp><h2></xmp>', defaultOpts).childNodes[0];

  expect(xmp.childNodes[0].data).to.be('<h2>');
});

// "&#" without digits is not a valid character reference and should stay in
// the text verbatim.
it('Should correctly parse malformed numbered entities', function() {
  var paragraph = parse('<p>z&#</p>', defaultOpts).childNodes[0];

  expect(paragraph.childNodes[0].data).to.be('z&#');
});

// An <h2> closed by </h3> should still end the heading, leaving the <div> as
// a sibling rather than a child.
it('Should correctly parse mismatched headings', function() {
  var nodes = parse('<h2>Test</h3><div></div>', defaultOpts).childNodes;

  expect(nodes.length).to.be(2);
  expect(nodes[0].tagName).to.be('h2');
  expect(nodes[1].tagName).to.be('div');
});
});

});

0 comments on commit c1b944e

Please sign in to comment.