Implement #53. Various style improvements.
chrisnewtn committed Nov 4, 2013
1 parent d6e4a38 commit aeca44d
Showing 4 changed files with 84 additions and 28 deletions.
58 changes: 44 additions & 14 deletions lib/grapher.js
@@ -43,7 +43,7 @@ Grapher.prototype = {
rootPage.verified = true;
this.pages[this.rootUrl] = rootPage;

- this.fetchPages(function(err, graph){
+ this.fetchPages(function (err, graph) {
// log the end of the graph build
var requestTimes = _.pluck(graph.pages, 'requestTime'),
    total = 0;
@@ -58,7 +58,6 @@ Grapher.prototype = {

callback(err, graph);
});
-
},


@@ -70,9 +69,9 @@ Grapher.prototype = {

whenPageIsFetched = function (page) {
// if the root url errors, stop the whole parse by throwing an error
- if(page && page.status === "errored" && page.url === self.rootUrl){
-   callback(page.errorMsg, self);
- }else{
+ if (page && page.status === "errored" && page.url === self.rootUrl) {
+   callback(page.errorMsg, self);
+ } else {
if (self.allFetched()) {
// finished fetching all pages, execute callback.
self.verifyPages();
@@ -86,7 +85,7 @@ Grapher.prototype = {
};

findUnfetchedPages = function(){
- _.each(self.pages, function (page) {
+ _.each(self.pages, function (page) {
if (page.status === "unfetched") {
if (page.level <= self.options.crawlLimit) {
page.fetch(whenPageIsFetched);
@@ -103,6 +102,38 @@ Grapher.prototype = {
},


+ replaceAliases: function () {
+   var didSomeReplacing = false;
+
+   _.each(this.pages, function (page, key, pages) {
+     page.aliases.forEach(function (alias) {
+       _.each(pages, function (pg, k, pgs) {
+         pg.links.forEach(function (link, i) {
+           if (link === alias) {
+             if (pg.links.length === 1) {
+               pg.links = [];
+             } else {
+               pg.links = pg.links.splice(i);
+             }
+             pg.links.push(page.url);
+
+             didSomeReplacing = true;
+           }
+         });
+       });
+     });
+   });
+
+   if (didSomeReplacing) {
+     var newPages = {};
+     _.each(this.pages, function (page) {
+       newPages[page.url] = page;
+     });
+     this.pages = newPages;
+   }
+ },
+
+
// A recursive function that checks if all pages are verified
// and attempts to verify the ones that aren't by checking
// them for links to ones that are. If at least one link is
@@ -135,7 +166,6 @@ Grapher.prototype = {
page.level = 0;
}
}
-
});

if (verifiedStuff) {
@@ -147,7 +177,7 @@

// gets a page from the pages array. Ignores protocol & trailing slashes
getPage: function (url) {
- if(url.indexOf('http') === 0){
+ if (url.indexOf('http') === 0) {
var url = url.substring(url.match(/^https?:\/\//)[0].length,url.length),
    url = fn.url.removeTrailingSlash(url),
    nowww = fn.url.removeWWW(url),
@@ -165,7 +195,7 @@
this.pages[httpnowww] || this.pages[httpnowww + "/"] ||
this.pages[httpsnowww] || this.pages[httpsnowww + "/"];

- }else{
+ } else {
return undefined;
}
},
@@ -182,12 +212,12 @@
truthTest, propList, pages;

if (self.options.strict) {
- propList = ['url', 'title', 'favicon', 'links', 'inboundCount'];
+ propList = ['url', 'title', 'favicon', 'links', 'inboundCount', 'aliases'];
truthTest = function (page) {
  return page.verified && page.status === "fetched";
}
} else {
- propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount'];
+ propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount', 'aliases'];
truthTest = function (page) {
  return page.status === "fetched";
}
@@ -219,7 +249,7 @@
crawled : _.size(self.pages),
verified : 0
};
- }else{
+ } else {
rtnObj = {
results : results,
query : self.rootUrl,
@@ -443,7 +473,7 @@ function graph (url, options, callback) {
}


- function graphIt(url){
+ function graphIt (url) {
grapher = new Grapher(url, options);
grapher.build(function (err, graph) {
// if we have an error reformat object
@@ -501,7 +531,7 @@ function getWebfinger(address, callback){


// merges passed and default options
- function mergeOptions(options){
+ function mergeOptions (options) {
// add interface for cache and logger
options.cache = (options.cache)? options.cache : internalCache;
options.logger = (options.logger)? options.logger : internalLogger;
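
The new replaceAliases method above is the core of this change: once a fetched page reports that it was redirected, every link in the graph that still points at the old URL is rewritten to the canonical one, and the pages map is re-keyed by canonical URL. The sketch below restates that idea in plain Node-style JavaScript; it is a minimal illustration under assumed data shapes (pages maps URL to { url, aliases, links }), not the library's code, and it swaps the committed splice-based replacement for a simpler in-place map.

    // Minimal sketch of alias replacement; data shapes are assumed, not taken
    // from the library. `pages` maps URL -> { url, aliases, links }.
    function replaceAliases(pages) {
      var didSomeReplacing = false;

      Object.keys(pages).forEach(function (key) {
        var page = pages[key];
        page.aliases.forEach(function (alias) {
          Object.keys(pages).forEach(function (k) {
            // rewrite any link that points at the alias to the canonical URL
            pages[k].links = pages[k].links.map(function (link) {
              if (link === alias) {
                didSomeReplacing = true;
                return page.url;
              }
              return link;
            });
          });
        });
      });

      // re-key the map so every page is indexed by its canonical URL
      if (didSomeReplacing) {
        var newPages = {};
        Object.keys(pages).forEach(function (key) {
          newPages[pages[key].url] = pages[key];
        });
        return newPages;
      }
      return pages;
    }

Unlike the committed version, which empties or splices pg.links before pushing page.url, this sketch replaces each match where it stands, which also preserves link order.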
12 changes: 12 additions & 0 deletions lib/page.js
@@ -19,6 +19,7 @@ function Page (url, grapher, options, sourceUrl, level) {
this.sourceUrl = sourceUrl;
this.level = level;
this.errorMsg = '';
+ this.aliases = [];

// add the domain from url into count
grapher.appendDomainCount(url);
@@ -44,6 +45,13 @@ Page.prototype = {
self.links = data.links;
self.favicon = data.favicon;
self.requestTime = data.requestTime;
+
+ if (data.resolvedUrl && data.resolvedUrl !== self.url) {
+   self.aliases.push(self.url);
+   self.url = data.resolvedUrl;
+ }
+
+ self.grapher.replaceAliases();
self.grapher.verifyPages();
self.addPages(self.links, self.url);
self.status = "fetched";
@@ -120,6 +128,10 @@ Page.prototype = {
rtnObj['sourceUrl'] = this.sourceUrl;
}

+ if (_.contains(props, 'aliases')) {
+   rtnObj['urlAliases'] = this.aliases;
+ }
+
return rtnObj;
},

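
With page.js now recording the pre-redirect URL in aliases and exposing it during serialization as urlAliases, a page that was crawled via a redirect might serialize along these lines. This is a hypothetical example; the field values and URLs are illustrative, not real output.

    // Hypothetical serialized page entry; values are illustrative.
    var examplePage = {
      url: 'https://example.com/profile',         // canonical URL after the redirect
      title: 'Example Profile',
      favicon: 'https://example.com/favicon.ico',
      links: ['https://anothersite.example/me'],
      inboundCount: 2,
      urlAliases: ['http://example.com/profile']  // the URL that was originally crawled
    };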
40 changes: 27 additions & 13 deletions lib/scraper.js
@@ -17,14 +17,15 @@ function scrape (url, options, callback) {
requestTime: 0
};

- logger.info('parsing: ' + url)
+ logger.info('parsing: ' + url);

try {
// get cached html or get html from page
if (options.cache && options.useCache && cache.has(url)) {
// from cache
- logger.log('fetched html from cache: ' + url);
- parseHTML(cache.get(url), 0, callback);
+ logger.log('fetched html from cache: ' + url);
+ var cachedPage = cache.get(url);
+ parseHTML(cachedPage.resolvedUrl, cachedPage.body, 0);
} else {
if (url) {
var startedRequest = new Date(),
@@ -37,9 +38,20 @@
request(requestObj, function(requestErrors, response, body) {
if (!requestErrors && response.statusCode === 200) {

+ var resolvedUrl = url;
+
+ if (response.request &&
+     response.request.uri.href &&
+     response.request.uri.href !== url) {
+   resolvedUrl = response.request.uri.href;
+ }
+
// add html into the cache
if (options.cache) {
- cache.set(url, body)
+ cache.set(url, {
+   resolvedUrl: resolvedUrl,
+   body: body
+ });
};

var endedRequest = new Date();
@@ -48,9 +60,9 @@

// is the content html
if (response.headers['content-type'].indexOf('text/html') > -1) {
- parseHTML(body, ms, callback);
+ parseHTML(resolvedUrl, body, ms);
} else {
- parseOtherFormat(body, url, ms, callback);
+ parseOtherFormat(body, url, ms);
}

} else {
@@ -72,32 +84,35 @@
}
}
} catch(err) {
+ console.log(err.stack);
logger.warn(err + ' - ' + url);
callback(err + ' - ' + url, data);
}


// return a blank object for formats other than html
- function parseOtherFormat(content, url, requestTime, callback){
- var url = require('url').parse(url),
-     icon = url.protocol + "//" + url.host + "/favicon.ico";
+ function parseOtherFormat(content, url, requestTime) {
+ var url = require('url').parse(url),
+     icon = url.protocol + "//" + url.host + "/favicon.ico";

data = {
links:[],
requestTime: requestTime,
title: url.href,
favicon: icon
- };
+ };

callback(null, data);
}


// parse the html for rel=me links
- function parseHTML(html, requestTime, callback){
+ function parseHTML(resolvedUrl, html, requestTime) {

var startedDOMParse = new Date(),
    $ = cheerio.load(html);

+ data.resolvedUrl = resolvedUrl;
data.requestTime = requestTime;

// get rel= me links from 'a' or 'link' tags
@@ -128,15 +143,14 @@ function scrape (url, options, callback) {
data.title = fn.trim($('title').text().replace(/(\r\n|\n|\r)/gm,""));

// get the favicon
- data.favicon = resolveFavicon($, url);
+ data.favicon = resolveFavicon($, data.resolvedUrl || url);

var endedDOMParse = new Date();
var ms = endedDOMParse.getTime() - startedDOMParse.getTime();
logger.log('time to parse DOM: ' + ms + 'ms - ' + url);

callback(null, data);
}

}


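
The scraper changes feed the alias machinery: the resolved URL is taken from the final request the HTTP client actually made after following redirects, and the cache now stores { resolvedUrl, body } instead of the bare HTML so that cache hits keep redirect information. With the request library used here, the final URL is available as response.request.uri.href. Below is a minimal sketch of that check, assuming request v2-era behavior; the URL and variable names are illustrative.

    var request = require('request');

    var requestedUrl = 'http://example.com'; // illustrative

    // `request` follows redirects by default; response.request.uri is the
    // parsed URL of the request that ultimately produced the response.
    request({ uri: requestedUrl }, function (err, response, body) {
      if (err) { return console.error(err); }

      var resolvedUrl = requestedUrl;
      if (response.request &&
          response.request.uri.href &&
          response.request.uri.href !== requestedUrl) {
        resolvedUrl = response.request.uri.href; // e.g. after a 301 to https
      }

      console.log('resolved to: ' + resolvedUrl);
    });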
2 changes: 1 addition & 1 deletion package.json
@@ -2,7 +2,7 @@
"author": "Dharmafly (dharmafly.com)",
"name": "elsewhere",
"description": "A node project that aims to replicate the functionality of the Google Social Graph API",
- "version": "0.0.4",
+ "version": "0.0.5",
"keywords": [
"discovery",
"spider",
