Skip to content

Commit

Permalink
allow nlp plugin use and update docs
Browse files Browse the repository at this point in the history
  • Loading branch information
Frank MacDonald authored and Frank MacDonald committed Jul 9, 2020
1 parent 8a8fc35 commit 151ad0b
Show file tree
Hide file tree
Showing 6 changed files with 3,185 additions and 4,918 deletions.
44 changes: 43 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,9 @@ var options = {
// puppeteer goto options (https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md#pagegotourl-options)
goto: {
waitUntil: 'domcontentloaded'
}
},
// Ignore content security policy
setBypassCSP: true
},
title: {
useBestTitlePart: false, // true turns on the title processing
Expand Down Expand Up @@ -160,15 +162,55 @@ there are some additional "complex" options available

```
var options = {
// array of html elements to strip before analysis
striptags: [],
// array of resource types to block e.g. ['image' ]
blockedResourceTypes: [],
// array of resource source names (all resources from
// these sources are skipped) e.g. [ 'google', 'facebook' ]
skippedResources: [],
// readability options (https://ghub.io/node-readability)
readability: {},
// retext spell options (https://ghub.io/retext-spell)
retextspell: {},
// compromise nlp options
nlp: { plugins: [ myPlugin, anotherPlugin ] }
}
```

### Using Compromise plugins to improve results

Compromise is the natural language processor that allows `horseman-article-parser` to return
topics e.g. people, places & organisations. You can now pass custom plugins to compromise to modify or add to the word lists like so.
This allows us to match - for example - names which are not in the base compromise word lists.

```
/** add some names */
let testPlugin = function(Doc, world) {
world.addWords({
'rishi': 'FirstName',
'sunak': 'LastName',
})
}
const options = {
url: 'https://www.theguardian.com/commentisfree/2020/jul/08/the-guardian-view-on-rishi-sunak-right-words-right-focus-wrong-policies',
enabled: ['lighthouse', 'screenshot', 'links', 'sentiment', 'entities', 'spelling', 'keywords'],
nlp: {
plugins: [testPlugin]
}
}
```

Check out the compromise plugin [docs](https://observablehq.com/@spencermountain/compromise-plugins) for more info.

## Development

Please feel free to fork the repo or open pull requests to the development branch. I've used [eslint](https://eslint.org/) for linting.
Expand Down
12 changes: 12 additions & 0 deletions helpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ module.exports.setDefaultOptions = function (options) {
}
}

if (!options.puppeteer.hasOwnProperty('setBypassCSP')) {
options.puppeteer.setBypassCSP = true
}

if (!options.hasOwnProperty('striptags')) {
options.striptags = []
}
Expand All @@ -48,6 +52,14 @@ module.exports.setDefaultOptions = function (options) {
options.title = {}
}

if (!options.hasOwnProperty('nlp')) {
options.nlp = {}
}

if (!options.nlp.hasOwnProperty('plugins')) {
options.nlp.plugins = []
}

return options
}

Expand Down
9 changes: 8 additions & 1 deletion index.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,13 @@ module.exports.parseArticle = async function (options, socket) {

options = helpers.setDefaultOptions(options)

// Allow nlp plugins to be passed in (https://observablehq.com/@spencermountain/compromise-plugins)
if (options.nlp.plugins.length >= 1) {
for (const plugin of options.nlp.plugins) {
nlp.extend(plugin)
}
}

const actions = [articleParser(options, socket)]

if (options.enabled.includes('lighthouse')) {
Expand Down Expand Up @@ -83,7 +90,7 @@ const articleParser = async function (options, socket) {
const page = await browser.newPage()

// Ignore content security policies
page.setBypassCSP(true)
await page.setBypassCSP(options.puppeteer.setBypassCSP)

await page.setRequestInterception(true)

Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "horseman-article-parser",
"version": "0.8.4",
"version": "0.8.5",
"description": "Web Page Inspection Tool. Sentiment Analysis, Keyword Extraction, Named Entity Recognition & Spell Check",
"main": "index.js",
"scripts": {
Expand Down
16 changes: 13 additions & 3 deletions test.js
Original file line number Diff line number Diff line change
@@ -1,10 +1,20 @@
const parser = require('./index.js')
const fs = require('fs')

/**
 * Compromise plugin used by this test script to register two extra names.
 * Plugin format: https://observablehq.com/@spencermountain/compromise-plugins
 *
 * @param {*} Doc - first plugin argument; not used here
 * @param {*} world - second plugin argument; its addWords() receives the extra word/tag pairs
 */
const testPlugin = function (Doc, world) {
  // word -> tag pairs handed to world.addWords in a single call
  const extraNames = {
    rishi: 'FirstName',
    sunak: 'LastName'
  }
  world.addWords(extraNames)
}

const options = {
url: 'https://www.theguardian.com/uk-news/2020/jul/08/rishi-sunak-unveils-stamp-duty-holiday-and-hospitality-vat-cut-furloughed-coronavirus',
enabled: ['lighthouse', 'screenshot', 'links', 'sentiment', 'entities', 'spelling', 'keywords']
// enabled: ['links', 'sentiment', 'entities', 'spelling', 'keywords']
url: 'https://www.theguardian.com/commentisfree/2020/jul/08/the-guardian-view-on-rishi-sunak-right-words-right-focus-wrong-policies',
enabled: ['lighthouse', 'screenshot', 'links', 'sentiment', 'entities', 'spelling', 'keywords'],
nlp: {
plugins: [testPlugin]
}
}

parser.parseArticle(options)
Expand Down
Loading

0 comments on commit 151ad0b

Please sign in to comment.