From aeca44d3dfe7cbdea8571c609eceda634202d469 Mon Sep 17 00:00:00 2001
From: Chris Newton
Date: Mon, 4 Nov 2013 17:14:34 +0000
Subject: [PATCH] Implement #53. Various style improvements.

---
 lib/grapher.js | 58 ++++++++++++++++++++++++++++++++++++++------------
 lib/page.js    | 12 +++++++++++
 lib/scraper.js | 40 +++++++++++++++++++++++-----------
 package.json   |  2 +-
 4 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/lib/grapher.js b/lib/grapher.js
index 230b7d1..691e52f 100644
--- a/lib/grapher.js
+++ b/lib/grapher.js
@@ -43,7 +43,7 @@ Grapher.prototype = {
     rootPage.verified = true;
     this.pages[this.rootUrl] = rootPage;
 
-    this.fetchPages(function(err, graph){
+    this.fetchPages(function (err, graph) {
       // log the end of the graph build
       var requestTimes = _.pluck(graph.pages, 'requestTime'),
           total = 0;
@@ -58,7 +58,6 @@ Grapher.prototype = {
 
       callback(err, graph);
     });
-
   },
 
 
@@ -70,9 +69,9 @@ Grapher.prototype = {
 
     whenPageIsFetched = function (page) {
       // if the root url errors, stop the whole parse with by throwing an error
-      if(page && page.status === "errored" && page.url === self.rootUrl){
-        callback(page.errorMsg, self);
-      }else{
+      if (page && page.status === "errored" && page.url === self.rootUrl) {
+        callback(page.errorMsg, self);
+      } else {
         if (self.allFetched()) {
           // finished fetching all pages, execute callback.
           self.verifyPages();
@@ -86,7 +85,7 @@ Grapher.prototype = {
     };
 
     findUnfetchedPages = function(){
-      _.each(self.pages, function (page) { 
+      _.each(self.pages, function (page) {
         if (page.status === "unfetched") {
           if (page.level <= self.options.crawlLimit) {
             page.fetch(whenPageIsFetched);
@@ -103,6 +102,38 @@ Grapher.prototype = {
   },
 
 
+  replaceAliases: function () {
+    var didSomeReplacing = false;
+
+    _.each(this.pages, function (page, key, pages) {
+      page.aliases.forEach(function (alias) {
+        _.each(pages, function (pg, k, pgs) {
+          pg.links.forEach(function (link, i) {
+            if (link === alias) {
+              if (pg.links.length === 1) {
+                pg.links = [];
+              } else {
+                pg.links.splice(i, 1);
+              }
+              pg.links.push(page.url);
+
+              didSomeReplacing = true;
+            }
+          });
+        });
+      });
+    });
+
+    if (didSomeReplacing) {
+      var newPages = {};
+      _.each(this.pages, function (page) {
+        newPages[page.url] = page;
+      });
+      this.pages = newPages;
+    }
+  },
+
+
   // A recursive function that checks if all pages are verified
   // and attempts to verify the ones that aren't by checking
   // them for links to ones that are. If at least one link is
@@ -135,7 +166,6 @@ Grapher.prototype = {
           page.level = 0;
         }
       }
-
     });
 
     if (verifiedStuff) {
@@ -147,7 +177,7 @@ Grapher.prototype = {
 
   // gets a page from the pages array. Ignores protocol & trailing slashes
   getPage: function (url) {
-    if(url.indexOf('http') === 0){
+    if (url.indexOf('http') === 0) {
       var url = url.substring(url.match(/^https?:\/\//)[0].length,url.length),
           url = fn.url.removeTrailingSlash(url),
           nowww = fn.url.removeWWW(url),
@@ -165,7 +195,7 @@ Grapher.prototype = {
              this.pages[httpnowww] || this.pages[httpnowww + "/"] ||
              this.pages[httpsnowww] || this.pages[httpsnowww + "/"];
-    }else{
+    } else {
       return undefined;
     }
   },
 
@@ -182,12 +212,12 @@ Grapher.prototype = {
         truthTest, propList, pages;
 
     if (self.options.strict) {
-      propList = ['url', 'title', 'favicon', 'links', 'inboundCount'];
+      propList = ['url', 'title', 'favicon', 'links', 'inboundCount', 'aliases'];
       truthTest = function (page) {
         return page.verified && page.status === "fetched";
       }
     } else {
-      propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount'];
+      propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount', 'aliases'];
       truthTest = function (page) {
         return page.status === "fetched";
       }
@@ -219,7 +249,7 @@ Grapher.prototype = {
         crawled  : _.size(self.pages),
         verified : 0
       };
-    }else{
+    } else {
       rtnObj = {
         results  : results,
         query    : self.rootUrl,
@@ -443,7 +473,7 @@ function graph (url, options, callback) {
   }
 
 
-  function graphIt(url){
+  function graphIt (url) {
     grapher = new Grapher(url, options);
     grapher.build(function (err, graph) {
       // if we have an error reformat object
@@ -501,7 +531,7 @@ function getWebfinger(address, callback){
 
 
 // merges passed and default options
-function mergeOptions(options){
+function mergeOptions (options) {
   // add interface for cache and logger
   options.cache = (options.cache)? options.cache : internalCache;
   options.logger = (options.logger)? options.logger : internalLogger;
diff --git a/lib/page.js b/lib/page.js
index 68e2d4e..2a83470 100644
--- a/lib/page.js
+++ b/lib/page.js
@@ -19,6 +19,7 @@ function Page (url, grapher, options, sourceUrl, level) {
   this.sourceUrl = sourceUrl;
   this.level = level;
   this.errorMsg = '';
+  this.aliases = [];
 
   // add the domain from url into count
   grapher.appendDomainCount(url);
@@ -44,6 +45,13 @@ Page.prototype = {
         self.links = data.links;
         self.favicon = data.favicon;
         self.requestTime = data.requestTime;
+
+        if (data.resolvedUrl && data.resolvedUrl !== self.url) {
+          self.aliases.push(self.url);
+          self.url = data.resolvedUrl;
+        }
+
+        self.grapher.replaceAliases();
         self.grapher.verifyPages();
         self.addPages(self.links, self.url);
         self.status = "fetched";
@@ -120,6 +128,10 @@ Page.prototype = {
       rtnObj['sourceUrl'] = this.sourceUrl;
     }
 
+    if (_.contains(props, 'aliases')) {
+      rtnObj['urlAliases'] = this.aliases;
+    }
+
     return rtnObj;
   },
 
diff --git a/lib/scraper.js b/lib/scraper.js
index 822ba2e..20d8f49 100644
--- a/lib/scraper.js
+++ b/lib/scraper.js
@@ -17,14 +17,15 @@ function scrape (url, options, callback) {
         requestTime: 0
       };
 
-  logger.info('parsing: ' + url)
+  logger.info('parsing: ' + url);
 
   try {
     // get cached html or get html from page
     if (options.cache && options.useCache && cache.has(url)) {
       // from cache
-      logger.log('fetched html from cache: ' + url); 
-      parseHTML(cache.get(url), 0, callback);
+      logger.log('fetched html from cache: ' + url);
+      var cachedPage = cache.get(url);
+      parseHTML(cachedPage.resolvedUrl, cachedPage.body, 0);
     } else {
       if (url) {
         var startedRequest = new Date(),
@@ -37,9 +38,20 @@ function scrape (url, options, callback) {
 
         request(requestObj, function(requestErrors, response, body) {
           if (!requestErrors && response.statusCode === 200) {
+            var resolvedUrl = url;
+
+            if (response.request &&
+                response.request.uri.href &&
+                response.request.uri.href !== url) {
+              resolvedUrl = response.request.uri.href;
+            }
+
             // add html into the cache
             if (options.cache) {
-              cache.set(url, body)
+              cache.set(url, {
+                resolvedUrl: resolvedUrl,
+                body: body
+              });
             };
 
             var endedRequest = new Date();
@@ -48,9 +60,9 @@ function scrape (url, options, callback) {
 
             // is the content html
             if (response.headers['content-type'].indexOf('text/html') > -1) {
-              parseHTML(body, ms, callback);
+              parseHTML(resolvedUrl, body, ms);
             } else {
-              parseOtherFormat(body, url, ms, callback);
+              parseOtherFormat(body, url, ms);
             }
 
           } else {
@@ -72,32 +84,35 @@ function scrape (url, options, callback) {
       }
     }
   } catch(err) {
+    console.log(err.stack);
     logger.warn(err + ' - ' + url);
     callback(err + ' - ' + url, data);
   }
 
 
 
   // return a blank object for formats other than html
-  function parseOtherFormat(content, url, requestTime, callback){
-    var url = require('url').parse(url),
-      icon = url.protocol + "//" + url.host + "/favicon.ico";
+  function parseOtherFormat(content, url, requestTime) {
+    var url  = require('url').parse(url),
+        icon = url.protocol + "//" + url.host + "/favicon.ico";
 
     data = {
       links:[],
       requestTime: requestTime,
       title: url.href,
       favicon: icon
-      };
+    };
+    callback(null, data);
 
   }
 
 
   // parse the html for rel=me links
-  function parseHTML(html, requestTime, callback){
+  function parseHTML(resolvedUrl, html, requestTime) {
     var startedDOMParse = new Date(),
         $ = cheerio.load(html);
 
+    data.resolvedUrl = resolvedUrl;
     data.requestTime = requestTime;
 
     // get rel= me links from 'a' or 'link' tags
@@ -128,7 +143,7 @@ function scrape (url, options, callback) {
      data.title = fn.trim($('title').text().replace(/(\r\n|\n|\r)/gm,""));
 
      // get the favicon
-     data.favicon = resolveFavicon($, url);
+     data.favicon = resolveFavicon($, data.resolvedUrl || url);
 
      var endedDOMParse = new Date();
      var ms = endedDOMParse.getTime() - startedDOMParse.getTime();
@@ -136,7 +151,6 @@ function scrape (url, options, callback) {
 
     callback(null, data);
   }
-
 }
 
 
diff --git a/package.json b/package.json
index 0d95e14..01a7ecd 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "author": "Dharmafly (dharmafly.com)",
   "name": "elsewhere",
   "description": "A node project that aims to replicate the functionality of the Google Social Graph API",
-  "version": "0.0.4",
+  "version": "0.0.5",
   "keywords": [
     "discovery",
     "spider",