forked from tlhunter/spidermonkey
-
Notifications
You must be signed in to change notification settings - Fork 0
/
execute.php
102 lines (94 loc) · 3.49 KB
/
execute.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
<?php
namespace ThomasHunter\SpiderMonkey;
include_once("includes/thomashunter/spidermonkey/spidermonkey.php");
include_once('includes/simple_html_dom.php');
# Script-wide handles: the spider instance is assigned once the URL method is
# known, and $parsed_object accumulates one entry per downloaded document.
$spider = null;
$parsed_object = array();

# Global Configuration
# Each setting is copied as-is from the submitted form. Only 'timeout' has a
# fallback (0) for when the field was not posted at all.
$config = array(
    'url_method'   => $_POST['url_method'],
    'give_up_404'  => $_POST['give_up_404'],
    'simultaneous' => $_POST['simultaneous'],
    'delay'        => $_POST['delay'],
    'timeout'      => isset($_POST['timeout']) ? $_POST['timeout'] : 0,
    'user_agent'   => $_POST['user_agent'],
    'referer'      => $_POST['referer'],
    'storage'      => $_POST['storage'],
);

# Output Configuration
switch ($config['storage']) {
    case 'database':
        # Database storage needs the MySQL connection details from the form.
        $config['mysql'] = array(
            'username' => $_POST['mysql_username'],
            'password' => $_POST['mysql_password'],
            'server'   => $_POST['mysql_server'],
            'database' => $_POST['mysql_database'],
        );
        break;

    case 'display':
        # On-screen display needs no additional settings.
        break;

    default:
        # Every other storage type records a filename; 'local' is a checkbox,
        # so its mere presence in the POST data means it was ticked.
        $config['filename'] = $_POST['filename'];
        $config['local'] = isset($_POST['local']);
        break;
}
# Spider Method Configuration
# Chooses the spider implementation from the posted 'url_method' and seeds its
# queue: either one URL per integer in a range, or a single starting URL.
if ($config['url_method'] == 'increment') {
    $config['url_structure'] = $_POST['url_structure'];
    $config['start_integer'] = $_POST['start_integer'];
    $config['stop_integer'] = $_POST['stop_integer'];

    # The bounds arrive as raw form strings. Looping over them directly would
    # let a non-numeric value trigger PHP's alphanumeric string increment, and
    # would put a zero-padded start such as "007" into the first URL only
    # (later iterations become plain integers). Reject non-numeric bounds and
    # iterate over real integers instead.
    if (!is_numeric($config['start_integer']) || !is_numeric($config['stop_integer'])) {
        die("Invalid start or stop integer supplied.");
    }
    $first_number = (int) $config['start_integer'];
    $last_number = (int) $config['stop_integer'];

    $spider = New SpiderIncrement($config);

    # Every '#' in the URL structure is replaced by the current number.
    for ($number = $first_number; $number <= $last_number; $number++) {
        $spider->queue_add(str_replace('#', $number, $config['url_structure']));
    }
} else if ($config['url_method'] == 'crawl') {
    $config['first_url'] = $_POST['first_url'];
    $config['limit_domain'] = $_POST['limit_domain'];
    $config['max_depth'] = $_POST['max_depth'];

    # Crawling starts from a single URL queued here.
    $spider = New SpiderCrawl($config);
    $spider->queue_add($config['first_url']);
} else {
    die("Invalid URL Spidering method supplied.");
}
# Collect the capture definitions submitted with the form. Every id listed in
# 'captures' points at three sibling fields: capture-<id> (the capture method),
# expression-<id> and name-<id>.
$capture_object = array();
if (isset($_POST['captures'])) {
    foreach ($_POST['captures'] as $capture_id) {
        $capture_object[] = array(
            'capture'    => $_POST['capture-' . $capture_id],
            'expression' => $_POST['expression-' . $capture_id],
            'name'       => $_POST['name-' . $capture_id],
        );
    }
}
# Spider Execution
# Runs the queued downloads, applies every configured capture to each returned
# document, and prints the collected results as JSON inside a <pre> element.
$documents = $spider->execute();
if (!$documents) {
    die($spider->last_error);
}

foreach ($documents as $doc) { # One result row per downloaded document
    $row = array('url' => $doc['url']);

    # Builds the HTML document once for each page.
    # NOTE(review): simple_html_dom's str_get_html() appears to return false
    # for empty or oversized input - confirm against the bundled version. The
    # guards below keep such a page from causing a fatal error on ->find() or
    # ->clear(); regex and asterisk captures still run on the raw source.
    $html = str_get_html($doc['doc']);

    foreach ($capture_object as $capture) { # Apply each requested capture
        switch ($capture['capture']) {
            case 'selector':
                if ($html) {
                    $find = $html->find($capture['expression'], 0);
                    if ($find) {
                        # Prefer the inner markup; fall back to the outer
                        # markup when the inner text is empty.
                        $row[$capture['name']] = $find->innertext ? : $find->outertext;
                    }
                }
                break;

            case 'regex':
                $matches = array();
                if (preg_match('~' . $capture['expression'] . '~', $doc['doc'], $matches)) {
                    # Use the first capture group when the expression has one,
                    # otherwise the whole match (previously an undefined index).
                    $row[$capture['name']] = isset($matches[1]) ? $matches[1] : $matches[0];
                }
                break;

            case 'asterisk':
                # "before*after" captures whatever sits between the two
                # literals. Both sides are quoted with the '~' delimiter so a
                # literal tilde cannot terminate the pattern early, and a
                # missing '*' simply leaves the right-hand literal empty.
                $items = explode('*', $capture['expression']);
                $before = preg_quote($items[0], '~');
                $after = isset($items[1]) ? preg_quote($items[1], '~') : '';
                $matches = array();
                if (preg_match('~' . $before . '(.*?)' . $after . '~', $doc['doc'], $matches)) {
                    $row[$capture['name']] = $matches[1];
                }
                break;

            default:
                die("Invalid Capture Method Specified");
        }
    }

    # Release the parsed DOM before moving on to the next page.
    if ($html) {
        $html->clear();
    }
    unset($html);

    $parsed_object[] = $row;
}

# The JSON contains scraped, untrusted markup; escape it so the browser shows
# it as text instead of interpreting it as HTML.
echo "<pre>", htmlspecialchars(json_encode($parsed_object), ENT_QUOTES, 'UTF-8'), "</pre>\n";