Adds a `sapper extract` CLI command, which scrapes the server to run … #66
Merged
Changes from 1 commit. Commits (12):
30ddb3d (freedmand): Adds a `sapper extract` CLI command, which scrapes the server to run …
d08f9eb (freedmand): Fixes funky indentation in extraction unit test
fc8280a (freedmand): Fixes small issue with reading chunk files
7588911 (freedmand): Removes all async/await from the extraction pipeline, and adds unit t…
9ea4137 (freedmand): Add option to extract server-side routes at directories other than /api.
33b6450 (Rich-Harris): Merge branch 'master' into master
304c060 (Rich-Harris): Merge branch 'master' into master
d6dda37 (Rich-Harris): typo
6a4dc19 (Rich-Harris): Enforce prod mode, return a Promise so it can be used programmatically
2af2ab3 (Rich-Harris): Ensure output dir exists, return Promise
f27b797 (Rich-Harris): Build before extracting
50011e2 (Rich-Harris): always print stdout/stderr, to avoid wild goose chase debugging
@@ -0,0 +1,223 @@
const fs = require('fs-extra');
const app = require('express')();
const compression = require('compression');
const sapper = require('../index.js');
const static = require('serve-static');
const Spider = require('node-spider');
const path = require('path');

const { PORT = 3000, OUTPUT_DIR = 'dist' } = process.env;
const { dest: sapperDest } = require('../config.js');

const prefix = `http://localhost:${PORT}`;

/**
 * Returns the full URL of the specified path in the server.
 * @param {string} url The path for which to get the complete URL.
 * @return {string} The full URL.
 */
function getFullUrl(url) {
  if (url.startsWith(prefix)) return url;
  return `${prefix}${url}`;
}
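// Illustration (not part of the original diff): with PORT=3000,
// getFullUrl('/blog') === 'http://localhost:3000/blog'; a URL that already
// starts with the prefix is returned unchanged.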

/**
 * Returns the extension on the URL or '' if there is none.
 * @param {string} url The URL.
 * @return {string} The URL's extension or the empty string if the URL has no
 *     extension.
 */
function getExtension(url) {
  const splits = url.split('.');
  let extension = splits[splits.length - 1].trim();
  if (!/^[a-zA-Z0-9]+$/.test(extension) || extension.length > 10) {
    // Clear the extension if it is not alphanumeric or is long enough to
    // signify it may just be a hash value or something.
    extension = '';
  }
  return extension;
}
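// Illustration (not part of the original diff):
//   getExtension('/client/main.js') === 'js'
//   getExtension('/blog/some-post') === ''   (no dot-delimited suffix)
//   getExtension('file.2b64cdee95649c6f31cd') === ''   (long suffix treated as a hash)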

/**
 * Returns the relative path for the specified URL, adding index.html if the URL
 * ends in `/`. This makes the URL function well in a static site.
 * @param {string} url The URL for which to retrieve the relative path.
 * @return {string} A URL that starts with / that is relative to the server
 *     root. The URL will add index.html if it ends with `/`.
 */
function relativePath(url) {
  if (url.startsWith(prefix)) return relativePath(url.substr(prefix.length));
  if (url.endsWith('/')) url += 'index.html';
  if (getExtension(url) == '') url += '/index.html';
  if (url.startsWith('/')) return url;
  throw new Error('Bad url');
}
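// Illustration (not part of the original diff), with PORT=3000:
//   relativePath('http://localhost:3000/blog/') === '/blog/index.html'
//   relativePath('/about') === '/about/index.html'
//   relativePath('/client/main.js') === '/client/main.js'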

/**
 * Returns the Sapper API route for the specified URL path.
 * @param {string} url The absolute or relative URL.
 * @return {string} The URL with /api/ in front.
 */
function apiPath(url) {
  if (url.startsWith(prefix)) {
    return `${prefix}/api${url.substr(prefix.length)}`;
  }
  return `/api${url}`;
}
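// Illustration (not part of the original diff):
//   apiPath('/blog/my-post') === '/api/blog/my-post'
//   apiPath('http://localhost:3000/blog') === 'http://localhost:3000/api/blog'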

/**
 * Returns whether the specified URL is on the server or an external link.
 * @param {string} url The URL.
 * @return {boolean} True if the URL is on the server.
 */
function filter(url) {
  return url.startsWith('/') || url.startsWith(getFullUrl('/'));
}
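// Illustration (not part of the original diff), with PORT=3000:
//   filter('/blog') === true
//   filter('http://localhost:3000/blog') === true
//   filter('https://example.com/') === false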

/**
 * Retrieves chunk files that are normally cached for offline use in the service
 * worker.
 * @return {!Array<string>}
 */
function getChunkFiles() {
  const clientInfo =
      fs.readJsonSync(path.join(sapperDest, 'stats.client.json'));
  const chunkFiles = clientInfo.assets.map(chunk => `/client/${chunk.name}`);
  return chunkFiles;
}
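// Illustration (not part of the original diff): stats.client.json is assumed
// to contain entries shaped like { assets: [{ name: 'main.abc123.js' }, ...] },
// so this returns paths such as '/client/main.abc123.js'.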

/**
 * Exports the Sapper app as a static website by starting at the root and
 * crawling pages that are linked, their /api/ pages, and webpack routes, as
 * well as copying assets.
 * @param {?Array<string>=} includeUrls If non-null, a set of additional URLs to
 *     scrape in the extraction. This should only be set if there are routes
 *     that cannot be reached from the root.
 * @param {?Array<string>=} excludeUrls If non-null, a set of URLs to avoid
 *     scraping in the extraction.
 * @param {string=} extractionDir The directory in which to place the extracted
 *     output.
 */
module.exports = async function(includeUrls = null, excludeUrls = null,
    extractionDir = OUTPUT_DIR) {
  // Set up the server.

  // this allows us to do e.g. `fetch('/api/blog')` on the server
  const fetch = require('node-fetch');
  global.fetch = (url, opts) => {
    if (url[0] === '/') url = `http://localhost:${PORT}${url}`;
    return fetch(url, opts);
  };

  app.use(compression({ threshold: 0 }));

  app.use(static('assets'));

  app.use(sapper());

  // Clean the output directory and copy assets in.
  fs.removeSync(extractionDir);
  fs.copySync('assets', extractionDir);

  // If exclude URLs are set, normalize them.
  if (excludeUrls == null) excludeUrls = [];
  excludeUrls = excludeUrls.map((url) => getFullUrl(url));

  // The crux of the extraction, chaining the traditional server call with a web
  // scraper. The program automatically exits after all the static pages have
  // been scraped from the server that are accessible from the root page (`/`).
  const extractedFiles = [];  // keep track of extracted files.
  const server = await app.listen(PORT);
  console.log(`listening on port ${PORT} and beginning extraction`);

  return new Promise((resolve, reject) => {
    const spider = new Spider({
      concurrent: 5,
      delay: 0,
      logs: process.stderr,
      allowDuplicates: false,
      catchErrors: true,
      addReferrer: false,
      xhr: false,
      keepAlive: false,
      error: (err, url) => {
        console.error(`ERROR ${err} at ${url}`);
        reject();
      },
      // Called when there are no more requests
      done: async () => {
        await server.close();
        console.log('Done!');
        resolve();
      },

      headers: { 'user-agent': 'node-spider' },
      // Use a binary encoding to preserve image files.
      encoding: 'binary'
    });

    // The primary logic to handle a scraped page.
    const handleRequest = (doc) => {
      // Only deal with the page if it is on the server, i.e. it is not an
      // external link.
      if (!filter(doc.url)) return;
      // Skip URL if it is in the exclude list.
      if (excludeUrls.includes(getFullUrl(doc.url))) return;

      // Grab the page's relative path and write the page contents to a local
      // file.
      const relPath = relativePath(doc.url);
      extractedFiles.push(relPath);
      console.log(`GOT ${relPath}`);  // static page url
      fs.outputFileSync(path.join(extractionDir, relPath), doc.res.body,
          {encoding: 'binary'});

      /**
       * Resolves and checks if a given URL is local; if so, adds it to the
       * scraping queue.
       * @param {string} url The URL to process.
       */
      const process = (url) => {
        // Remove trailing hash if relevant.
        url = url.split('#')[0];
        // Resolve URL relative to server root.
        url = doc.resolve(url);
        // Crawl more if the URL is on the server.
        if (filter(url)) spider.queue(url, handleRequest);
      };

      const extension = getExtension(relPath);
      if (extension == 'html') {
        // Grab src and href attributes from html pages.
        doc.$('[src]').each((i, elem) => {
          process(doc.$(elem).attr('src'));
        });
        doc.$('[href]').each((i, elem) => {
          process(doc.$(elem).attr('href'));
        });
      }

      if (doc.url.endsWith('/service-worker.js')) {
        // Grab additional routes.
        const chunkFiles = getChunkFiles();
        chunkFiles.forEach(
            (url) => spider.queue(getFullUrl(url), handleRequest));
      }

      if (relPath.endsWith('/index.html') && !relPath.startsWith('/api/')) {
        // Attempt to grab the /api/ version of a page that seems to be a
        // basic route.
        spider.queue(apiPath(doc.url), handleRequest);
      }
    };

    // Start crawling with the document root and the service worker.
    spider.queue(getFullUrl('/'), handleRequest);
    spider.queue(getFullUrl('/service-worker.js'), handleRequest);

    if (includeUrls !== null) {
      includeUrls.forEach(
          (url) => spider.queue(getFullUrl(url), handleRequest));
    }
  });
};
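The exported function returns a Promise (later commits in this PR make that explicit so it can be used programmatically). A minimal usage sketch follows; the require path and the example route names are assumptions for illustration, since this page does not show where the file lives in the repository.

// Hypothetical programmatic usage of the extractor above.
// The './extract' path and the route names are illustrative only.
const extract = require('./extract');

extract(
  ['/hidden-route'],   // includeUrls: extra routes not reachable from '/'
  ['/drafts'],         // excludeUrls: routes to skip during scraping
  'dist'               // extractionDir: where the static site is written
).then(() => {
  console.log('static export complete');
}).catch((err) => {
  console.error('extraction failed', err);
});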
Super cool stuff! There is an issue to move away from `async/await`. Maybe it's best to preemptively not use it?

Thanks! Hmm, there are a lot of `await` calls in Sapper already. I'm happy to change out my `await` and `async` calls, but do you think the onus of not being able to run on platforms like AWS Lambda because of the Node version is on the platforms to fix?

@Rich-Harris has more insight on that than me, but I can definitely see the appeal of supporting lower versions of Node. I just have all the `async/await` in `test.js` left to replace in a PR, so no worries!

I've removed `async/await` in my changes to keep you from having to do the work :)
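For context, the conversion being discussed amounts to replacing the `await app.listen(...)` and `await server.close()` calls with their callback forms inside an ordinary Promise. A minimal self-contained sketch of that pattern, not the actual code from the later commits (it only assumes express is installed):

const express = require('express');

function run() {
  const app = express();
  return new Promise((resolve, reject) => {
    // app.listen takes a callback instead of being awaited.
    const server = app.listen(3000, () => {
      console.log('listening on port 3000');
      // ...crawl the site here, then shut down...
      // server.close likewise takes a callback rather than being awaited.
      server.close(() => {
        console.log('Done!');
        resolve();
      });
    });
    server.on('error', reject);
  });
}

run().then(() => console.log('extraction finished'));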