From aeca44d3dfe7cbdea8571c609eceda634202d469 Mon Sep 17 00:00:00 2001
From: Chris Newton
Date: Mon, 4 Nov 2013 17:14:34 +0000
Subject: [PATCH] Implement #53. Various style improvements.

---
 lib/grapher.js | 58 ++++++++++++++++++++++++++++++++++++++------------
 lib/page.js    | 12 +++++++++++
 lib/scraper.js | 40 +++++++++++++++++++++++-----------
 package.json   |  2 +-
 4 files changed, 84 insertions(+), 28 deletions(-)

diff --git a/lib/grapher.js b/lib/grapher.js
index 230b7d1..691e52f 100644
--- a/lib/grapher.js
+++ b/lib/grapher.js
@@ -43,7 +43,7 @@ Grapher.prototype = {
     rootPage.verified = true;
     this.pages[this.rootUrl] = rootPage;
 
-    this.fetchPages(function(err, graph){
+    this.fetchPages(function (err, graph) {
       // log the end of the graph build
       var requestTimes = _.pluck(graph.pages, 'requestTime'),
           total = 0;
@@ -58,7 +58,6 @@ Grapher.prototype = {
 
       callback(err, graph);
     });
-
   },
 
 
@@ -70,9 +69,9 @@ Grapher.prototype = {
 
     whenPageIsFetched = function (page) {
       // if the root url errors, stop the whole parse with by throwing an error
-      if(page && page.status === "errored" && page.url === self.rootUrl){
-        callback(page.errorMsg, self);
-      }else{
+      if (page && page.status === "errored" && page.url === self.rootUrl) {
+        callback(page.errorMsg, self);
+      } else {
         if (self.allFetched()) {
           // finished fetching all pages, execute callback.
           self.verifyPages();
@@ -86,7 +85,7 @@ Grapher.prototype = {
     };
 
     findUnfetchedPages = function(){
-      _.each(self.pages, function (page) { 
+      _.each(self.pages, function (page) {
         if (page.status === "unfetched") {
           if (page.level <= self.options.crawlLimit) {
             page.fetch(whenPageIsFetched);
@@ -103,6 +102,38 @@ Grapher.prototype = {
   },
 
 
+  replaceAliases: function () {
+    var didSomeReplacing = false;
+
+    _.each(this.pages, function (page, key, pages) {
+      page.aliases.forEach(function (alias) {
+        _.each(pages, function (pg, k, pgs) {
+          pg.links.forEach(function (link, i) {
+            if (link === alias) {
+              if (pg.links.length === 1) {
+                pg.links = [];
+              } else {
+                pg.links.splice(i, 1);
+              }
+              pg.links.push(page.url);
+
+              didSomeReplacing = true;
+            }
+          });
+        });
+      });
+    });
+
+    if (didSomeReplacing) {
+      var newPages = {};
+      _.each(this.pages, function (page) {
+        newPages[page.url] = page;
+      });
+      this.pages = newPages;
+    }
+  },
+
+
   // A recursive function that checks if all pages are verified
   // and attempts to verify the ones that aren't by checking
   // them for links to ones that are. If at least one link is
@@ -135,7 +166,6 @@ Grapher.prototype = {
           page.level = 0;
         }
       }
-
     });
 
     if (verifiedStuff) {
@@ -147,7 +177,7 @@ Grapher.prototype = {
 
   // gets a page from the pages array. Ignores protocol & trailing slashes
   getPage: function (url) {
-    if(url.indexOf('http') === 0){
+    if (url.indexOf('http') === 0) {
       var url = url.substring(url.match(/^https?:\/\//)[0].length,url.length),
           url = fn.url.removeTrailingSlash(url),
           nowww = fn.url.removeWWW(url),
@@ -165,7 +195,7 @@ Grapher.prototype = {
              this.pages[httpnowww] || this.pages[httpnowww + "/"] ||
              this.pages[httpsnowww] || this.pages[httpsnowww + "/"];
-    }else{
+    } else {
       return undefined;
     }
   },
 
@@ -182,12 +212,12 @@ Grapher.prototype = {
         truthTest, propList, pages;
 
     if (self.options.strict) {
-      propList = ['url', 'title', 'favicon', 'links', 'inboundCount'];
+      propList = ['url', 'title', 'favicon', 'links', 'inboundCount', 'aliases'];
       truthTest = function (page) {
         return page.verified && page.status === "fetched";
       }
     } else {
-      propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount'];
+      propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount', 'aliases'];
       truthTest = function (page) {
         return page.status === "fetched";
       }
@@ -219,7 +249,7 @@ Grapher.prototype = {
         crawled  : _.size(self.pages),
         verified : 0
       };
-    }else{
+    } else {
       rtnObj = {
         results  : results,
         query    : self.rootUrl,
@@ -443,7 +473,7 @@ function graph (url, options, callback) {
   }
 
 
-  function graphIt(url){
+  function graphIt (url) {
     grapher = new Grapher(url, options);
     grapher.build(function (err, graph) {
       // if we have an error reformat object
@@ -501,7 +531,7 @@ function getWebfinger(address, callback){
 
 
 // merges passed and default options
-function mergeOptions(options){
+function mergeOptions (options) {
   // add interface for cache and logger
   options.cache = (options.cache)? options.cache : internalCache;
   options.logger = (options.logger)? options.logger : internalLogger;
diff --git a/lib/page.js b/lib/page.js
index 68e2d4e..2a83470 100644
--- a/lib/page.js
+++ b/lib/page.js
@@ -19,6 +19,7 @@ function Page (url, grapher, options, sourceUrl, level) {
   this.sourceUrl = sourceUrl;
   this.level = level;
   this.errorMsg = '';
+  this.aliases = [];
 
   // add the domain from url into count
   grapher.appendDomainCount(url);
@@ -44,6 +45,13 @@ Page.prototype = {
         self.links = data.links;
         self.favicon = data.favicon;
         self.requestTime = data.requestTime;
+
+        if (data.resolvedUrl && data.resolvedUrl !== self.url) {
+          self.aliases.push(self.url);
+          self.url = data.resolvedUrl;
+        }
+
+        self.grapher.replaceAliases();
         self.grapher.verifyPages();
         self.addPages(self.links, self.url);
         self.status = "fetched";
@@ -120,6 +128,10 @@ Page.prototype = {
       rtnObj['sourceUrl'] = this.sourceUrl;
     }
 
+    if (_.contains(props, 'aliases')) {
+      rtnObj['urlAliases'] = this.aliases;
+    }
+
     return rtnObj;
   },
 
diff --git a/lib/scraper.js b/lib/scraper.js
index 822ba2e..20d8f49 100644
--- a/lib/scraper.js
+++ b/lib/scraper.js
@@ -17,14 +17,15 @@ function scrape (url, options, callback) {
         requestTime: 0
       };
 
-  logger.info('parsing: ' + url)
+  logger.info('parsing: ' + url);
 
   try {
     // get cached html or get html from page
     if (options.cache && options.useCache && cache.has(url)) {
       // from cache
-      logger.log('fetched html from cache: ' + url); 
-      parseHTML(cache.get(url), 0, callback);
+      logger.log('fetched html from cache: ' + url);
+      var cachedPage = cache.get(url);
+      parseHTML(cachedPage.resolvedUrl, cachedPage.body, 0);
     } else {
       if (url) {
         var startedRequest = new Date(),
@@ -37,9 +38,20 @@ function scrape (url, options, callback) {
 
         request(requestObj, function(requestErrors, response, body) {
           if (!requestErrors && response.statusCode === 200) {
+            var resolvedUrl = url;
+
+            if (response.request &&
+                response.request.uri.href &&
+                response.request.uri.href !== url) {
+              resolvedUrl = response.request.uri.href;
+            }
+
             // add html into the cache
             if (options.cache) {
-              cache.set(url, body)
+              cache.set(url, {
+                resolvedUrl: resolvedUrl,
+                body: body
+              });
             };
 
             var endedRequest = new Date();
@@ -48,9 +60,9 @@ function scrape (url, options, callback) {
 
             // is the content html
             if (response.headers['content-type'].indexOf('text/html') > -1) {
-              parseHTML(body, ms, callback);
+              parseHTML(resolvedUrl, body, ms);
             } else {
-              parseOtherFormat(body, url, ms, callback);
+              parseOtherFormat(body, url, ms);
             }
 
           } else {
@@ -72,32 +84,35 @@ function scrape (url, options, callback) {
       }
     }
   } catch(err) {
+    console.log(err.stack);
     logger.warn(err + ' - ' + url);
     callback(err + ' - ' + url, data);
   }
 
 
 
   // return a blank object for formats other than html
-  function parseOtherFormat(content, url, requestTime, callback){
-    var url = require('url').parse(url),
-      icon = url.protocol + "//" + url.host + "/favicon.ico";
+  function parseOtherFormat(content, url, requestTime) {
+    var url  = require('url').parse(url),
+        icon = url.protocol + "//" + url.host + "/favicon.ico";
 
     data = {
       links:[],
       requestTime: requestTime,
       title: url.href,
       favicon: icon
-      };
+    };
+    callback(null, data);
 
   }
 
 
   // parse the html for rel=me links
-  function parseHTML(html, requestTime, callback){
+  function parseHTML(resolvedUrl, html, requestTime) {
     var startedDOMParse = new Date(),
         $ = cheerio.load(html);
 
+    data.resolvedUrl = resolvedUrl;
     data.requestTime = requestTime;
 
     // get rel= me links from 'a' or 'link' tags
@@ -128,7 +143,7 @@ function scrape (url, options, callback) {
      data.title = fn.trim($('title').text().replace(/(\r\n|\n|\r)/gm,""));
 
      // get the favicon
-     data.favicon = resolveFavicon($, url);
+     data.favicon = resolveFavicon($, data.resolvedUrl || url);
 
      var endedDOMParse = new Date();
      var ms = endedDOMParse.getTime() - startedDOMParse.getTime();
@@ -136,7 +151,6 @@ function scrape (url, options, callback) {
 
     callback(null, data);
   }
-
 }
 
 
diff --git a/package.json b/package.json
index 0d95e14..01a7ecd 100644
--- a/package.json
+++ b/package.json
@@ -2,7 +2,7 @@
   "author": "Dharmafly (dharmafly.com)",
   "name": "elsewhere",
   "description": "A node project that aims to replicate the functionality of the Google Social Graph API",
-  "version": "0.0.4",
+  "version": "0.0.5",
   "keywords": [
     "discovery",
     "spider",