-
Notifications
You must be signed in to change notification settings - Fork 1
/
urlSearcher.php
110 lines (92 loc) · 2.31 KB
/
urlSearcher.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
<?php
require_once("./result.php");
require_once("./term.php");
/**
 * Depth-limited recursive crawler that ranks pages by search-term frequency.
 *
 * search() downloads a page, counts how often each configured term occurs in
 * it, records a Result for pages with at least one hit, and then follows the
 * links on that page that match the link pattern. sortLinks() orders the
 * recorded results by their totalCount (highest first) and showTop10() prints
 * the first ten of them.
 */
class UrlSearcher {
    /** @var int Depth at which search() stops recursing. */
    private $deepest;

    /** @var array Result objects of pages that contained at least one term. */
    private $goodLinks;

    /** @var array Terms that are counted on every crawled page. */
    private $terms;

    /** @var string PCRE pattern a link must match in order to be followed. */
    private $linkPattern;

    /** @var array Set of URLs that were already crawled (URL => true). */
    private $visited;

    /**
     * @param array  $searchTerms Literal strings to count on each page.
     * @param int    $depth       Depth at which the crawl stops recursing.
     * @param string $linkPattern PCRE pattern (with delimiters) that a link
     *                            must match to be followed.
     */
    public function __construct($searchTerms, $depth = 10, $linkPattern = '/techcrunch\.com\/2012/') {
        $this->terms = $searchTerms;
        $this->deepest = $depth;
        $this->linkPattern = $linkPattern;
        $this->goodLinks = array();
        $this->visited = array();
    }

    /**
     * Crawls $site and, recursively, the matching links found on it.
     *
     * Pages that cannot be downloaded or are empty are skipped instead of
     * aborting the crawl. Every URL is crawled at most once per searcher, so a
     * page cannot show up twice in the ranking.
     *
     * @param string $site  URL (or path) readable by file_get_contents().
     * @param int    $depth Depth of $site; callers normally start at 0.
     *
     * @return void
     */
    public function search($site, $depth = 0) {
        // ">=" instead of "==" so that a start depth above the limit also stops.
        if ($depth >= $this->deepest || isset($this->visited[$site])) {
            return;
        }
        $this->visited[$site] = true;

        echo "CURRENT DEPTH ".$depth." SITE ".$site."\n";

        $html = file_get_contents($site);
        if ($html === false || trim($html) === '') {
            // Unreachable or empty page: nothing to count and nothing to parse.
            return;
        }

        $pageResult = new Result($site);
        $totalHits = 0;
        foreach ($this->terms as $term) {
            $hits = $this->countWord($html, $term);
            $totalHits += $hits;
            if ($hits > 0) {
                $pageResult->addTerm(new Term($term, $hits));
            }
        }
        if ($totalHits > 0) {
            $this->goodLinks[] = $pageResult;
        }

        // Real-world markup is rarely valid; collect the parser complaints
        // internally and discard them instead of printing them.
        $dom = new DOMDocument();
        $previousSetting = libxml_use_internal_errors(true);
        $dom->loadHTML($html);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        // The deeper the page, the fewer of its links are followed.
        $budget = 20 / max(1, $depth + 1);
        $followed = 0;
        foreach ($this->uniqueURLS($this->getOtherUrls($dom)) as $url) {
            if (isset($this->visited[$url])) {
                // Already crawled: do not spend link budget on it again.
                continue;
            }
            if (preg_match($this->linkPattern, $url) === 1 && strpos($url, '#comments') === false) {
                $this->search($url, $depth + 1);
                $followed++;
            }
            if ($followed > $budget) {
                break;
            }
        }
    }

    /**
     * Counts the non-overlapping, case-sensitive occurrences of $word in the
     * part of $page that starts at the opening body tag.
     *
     * The term is matched literally, so characters such as "." or "/" carry no
     * special meaning. The body tag is located case-insensitively and may have
     * attributes; when no body tag exists the whole page is searched.
     *
     * @param string $page Raw HTML of the page.
     * @param string $word Literal term to look for.
     *
     * @return int Number of occurrences; 0 for an empty term.
     */
    private function countWord($page, $word) {
        $word = (string) $word;
        if ($word === '') {
            return 0;
        }
        $bodyPos = stripos($page, '<body');
        $afterBody = ($bodyPos === false) ? $page : substr($page, $bodyPos);
        return substr_count($afterBody, $word);
    }

    /**
     * Returns every anchor element below /html/body of the parsed document.
     *
     * @param DOMDocument $dom Parsed page.
     *
     * @return DOMNodeList Anchor elements in document order.
     */
    private function getOtherUrls($dom) {
        $xpath = new DOMXPath($dom);
        return $xpath->query('/html/body//a');
    }

    /**
     * Extracts the distinct, non-empty href values from a list of anchors.
     *
     * @param DOMNodeList $urls Anchor elements.
     *
     * @return array Unique href strings in first-seen order.
     */
    private function uniqueURLS($urls) {
        $hrefs = array();
        foreach ($urls as $anchor) {
            $href = $anchor->getAttribute('href');
            if ($href !== '') {
                $hrefs[] = $href;
            }
        }
        return array_values(array_unique($hrefs));
    }

    /**
     * Sorts the recorded results by totalCount, highest first.
     *
     * Results with equal totals keep the order in which they were recorded.
     *
     * @return void
     */
    public function sortLinks() {
        // Remember each result's original position so that ties are resolved
        // deterministically on every PHP version.
        $ranked = array();
        foreach (array_values($this->goodLinks) as $position => $result) {
            $ranked[] = array($position, $result);
        }

        usort($ranked, function ($left, $right) {
            $leftTotal = $left[1]->totalCount;
            $rightTotal = $right[1]->totalCount;
            if ($leftTotal == $rightTotal) {
                return $left[0] - $right[0];
            }
            return ($leftTotal < $rightTotal) ? 1 : -1;
        });

        $sorted = array();
        foreach ($ranked as $pair) {
            $sorted[] = $pair[1];
        }
        $this->goodLinks = $sorted;
    }

    /**
     * Prints up to ten results from the front of the result list, each
     * followed by its terms and their counts. Call sortLinks() first to get
     * the best-scoring pages.
     *
     * @return void
     */
    public function showTop10() {
        // array_slice copes with fewer than ten recorded results.
        foreach (array_slice($this->goodLinks, 0, 10) as $result) {
            echo $result->getSite()."\n";
            foreach ($result->getTerms() as $term) {
                echo "TERM: ".$term->name." COUNT: ".$term->count."\n";
            }
            echo "\n";
        }
    }
}
?>