-
Notifications
You must be signed in to change notification settings - Fork 0
/
dom.js
146 lines (115 loc) · 4.5 KB
/
dom.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
// URI-scheme prefixes of links that acquireLinks() skips, because they do not
// point at crawlable pages (phone, mail, messenger and script links).
// Every entry ends with ":" and carries no surrounding whitespace, so it only
// matches a real scheme and never a relative file name such as
// "javascript-intro.html".
var linkColons = ["tel:", "fax:", "sms:", "mailto:", "javascript:", "ts3server:",
    "wtai:", "market:", "geopoint:", "im:", "msnim:", "ymsgr:", "gtalk:", "sip:", "whatsapp:"];
/**
 * Predicate for Array.prototype.filter that keeps only the first occurrence
 * of every value.
 *
 * @param {*} value - Element currently being tested.
 * @param {number} index - Position of that element in the array.
 * @param {Array} self - The array being filtered.
 * @returns {boolean} true when `index` is the first position at which `value` occurs.
 */
function onlyUnique(value, index, self)
{
    const firstOccurrence = self.indexOf(value);
    return firstOccurrence === index;
}
/**
 * Parses a fetched page, publishes a short summary to the host document and
 * returns the page's links with duplicates (by path) removed.
 *
 * Side effects: writes the link count to the element with id "links", the
 * generated link markup to "pagelinks" and the page title to "title", and
 * stores the script-free body markup in the outer-scope variable `allOfIt`.
 *
 * @param {string} html - Markup of the page to analyse.
 * @param {string} address - Absolute URL of the page; `new URL` throws a
 *     TypeError when it is not a valid absolute URL.
 * @returns {Array|string} The link objects produced by acquireLinks(), keeping
 *     the first one for every distinct `path`, or the string "no html!" when
 *     the parser throws.
 */
function findPageInfo(html, address)
{
    const parser = new DOMParser();
    const url = new URL(address);
    // NOTE(review): the scheme is forced to https and any port is dropped;
    // `url.origin` would preserve both - confirm which one callers expect.
    const hostDomain = "https://" + url.hostname;

    // parse the html
    let xmlDoc;
    try
    {
        xmlDoc = parser.parseFromString(html, "text/html");
    }
    catch (error)
    {
        console.log("no html!");
        return "no html!";
    }

    // remove scripts & style from contents
    const discarded = [
        ...xmlDoc.getElementsByTagName("script"),
        ...xmlDoc.getElementsByTagName("style"),
        ...xmlDoc.getElementsByTagName("noscript"),
    ];
    discarded.forEach((node) => node.remove());

    // acquire links, title & the body content
    const links = [...xmlDoc.getElementsByTagName("a")];
    // A page without any <title> must fall back to the placeholder instead of
    // throwing; the text is read (not the markup) so it can never inject HTML.
    const titleElement = xmlDoc.getElementsByTagName("title")[0];
    const pageTitle = (titleElement && titleElement.textContent) || "(no title given)";
    // NOTE(review): `allOfIt` is not declared in this function; the assignment
    // is kept because code outside this block may read it - confirm.
    allOfIt = xmlDoc.body.innerHTML;

    const allLinks = acquireLinks(links, hostDomain);
    let linkText = "";
    for (const link of allLinks)
    {
        linkText += link.href;
    }

    document.getElementById("links").textContent = "Links: " + allLinks.length;
    // linkText is markup (<li> items) built by acquireLinks(), so it has to be
    // inserted as HTML.
    document.getElementById("pagelinks").innerHTML = linkText;
    document.getElementById("title").textContent = pageTitle;

    // Keep only the first link for every path; the count and the list shown
    // above deliberately still include all of them.
    const seenPaths = new Set();
    const uniqueByPath = [];
    for (const link of allLinks)
    {
        if (seenPaths.has(link.path))
        {
            continue;
        }
        seenPaths.add(link.path);
        uniqueByPath.push(link);
    }
    return uniqueByPath;
}
// Greater sophistication now: returns pathname, URL & href link via the new "NodeLink" class.
// However, support for relative links is still rudimentary.
/**
 * Turns the anchors of a page into NodeLink objects.
 *
 * Hrefs are de-duplicated and sorted, then filtered: anything starting with a
 * prefix from `linkColons` and pure in-page anchors ("#...") are dropped.
 * The rest is made absolute:
 *   - "http:" / "https:" links are used as they are,
 *   - protocol-relative links ("//host/...") get "https:" prepended,
 *   - everything else is appended to `hostDomain` (relative links are resolved
 *     against the host root only, not against the page's own folder).
 * Links that still cannot be parsed as a URL are skipped with a console
 * warning instead of aborting the whole page.
 *
 * @param {Iterable} links - Anchor elements; each is read through
 *     `link.attributes.href.textContent`, anchors without href are ignored.
 * @param {string} hostDomain - Scheme and host without a trailing slash,
 *     e.g. "https://example.com".
 * @returns {Array} One `new NodeLink(pathname, absoluteUrl, listItemMarkup)`
 *     per accepted link, in sorted href order.
 */
function acquireLinks(links, hostDomain)
{
    const browserTool = "http://localhost/linkGraph/linkGraph.html?url=";

    // The href text comes from a foreign page, so it is escaped before being
    // placed into the generated markup (attribute value and link text).
    const htmlEntities = { "&": "&amp;", "<": "&lt;", ">": "&gt;", "\"": "&quot;", "'": "&#39;" };
    const escapeHtml = (text) => text.replace(/[&<>"']/g, (character) => htmlEntities[character]);

    const rawHrefs = [];
    for (const link of links)
    {
        const hrefAttribute = link.attributes.href;
        if (typeof hrefAttribute !== "undefined")
        {
            rawHrefs.push(hrefAttribute.textContent);
        }
    }
    const uniqueLinks = [...new Set(rawHrefs)].sort();

    const linkCollection = [];
    for (const unique of uniqueLinks)
    {
        if (linkColons.some((prefix) => unique.startsWith(prefix)))
        {
            continue; // non-page scheme (mail, phone, ...)
        }
        if (unique.startsWith("#"))
        {
            continue; // ignore anchors
        }

        let fullURL;
        if (/^https?:/i.test(unique))
        {
            // Requires the colon, so a relative file such as "httpfoo.html"
            // is not mistaken for an absolute URL.
            fullURL = unique;
        }
        else if (unique.startsWith("//"))
        {
            fullURL = "https:" + unique;
        }
        else if (unique.startsWith("/"))
        {
            fullURL = hostDomain + unique;
        }
        else
        {
            // this won't work for folders and links with no / at front, alas.
            fullURL = hostDomain + "/" + unique;
        }

        let pathname;
        try
        {
            pathname = new URL(fullURL).pathname;
        }
        catch (error)
        {
            console.warn("skipping unparseable link: " + unique);
            continue;
        }

        const href = "<li><a href='" + escapeHtml(browserTool + fullURL) + "'>" + escapeHtml(unique) + "</a></li>";
        linkCollection.push(new NodeLink(pathname, fullURL, href));
    }
    return linkCollection;
}