-
Notifications
You must be signed in to change notification settings - Fork 1
/
WebCrawler.cpp
133 lines (108 loc) · 2.7 KB
/
WebCrawler.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#include "provided.h"
#include <string>
#include <list>
// Implementation class that the public WebCrawler forwards to.
// It keeps a queue of URLs waiting to be crawled and an Indexer that
// receives the contents of every page that was downloaded successfully.
class WebCrawlerImpl
{
public:
    // Starts with an empty queue and a URL count of zero.
    WebCrawlerImpl();

    // Queues `url` for a later crawl(); nothing is downloaded here.
    void addUrl(std::string url);

    // Number of addUrl() calls made so far (crawl() does not reduce it).
    int getNumberOfUrls() const;

    // Processes every queued URL in the order it was added, reporting each
    // outcome through `callback(url, success)`.
    void crawl(void(*callback)(std::string url, bool success));

    // Forward to the Indexer's save()/load() with the same filename base
    // and return whatever the Indexer returns.
    bool save(std::string filenameBase);
    bool load(std::string filenameBase);

private:
    Indexer                m_webCrawlerIndex;  // index fed by crawl()
    int                    m_numberOfUrls;     // incremented by addUrl()
    std::list<std::string> m_storedUrls;       // pending URLs: newest at front, oldest at back
};
// Creates a crawler with no queued URLs; the URL counter starts at zero.
// The index and the URL list are default-constructed.
WebCrawlerImpl::WebCrawlerImpl()
    : m_numberOfUrls(0)
{
}
// Remembers `url` for a later crawl() call; no download happens here.
void WebCrawlerImpl::addUrl(std::string url)
{
    ++m_numberOfUrls;

    // New entries go to the front; crawl() consumes from the back, so URLs
    // are processed in the order they were added.
    const std::string& pending = url;
    m_storedUrls.push_front(pending);
}
// Returns the counter that addUrl() increments once per call.
int WebCrawlerImpl::getNumberOfUrls() const
{
    const int urlCount = m_numberOfUrls;
    return urlCount;
}
// Crawls every queued URL, oldest first, and empties the queue.
//
// For each URL:
//   1. Download the page at that URL.
//   2. If the download succeeded, build a WordBag from the page and
//      incorporate it into this crawler's Indexer under that URL.
//   3. Report the outcome through `callback(url, success)`.
//
// callback: function invoked once per URL with the URL and whether its
//           download succeeded. A null pointer is tolerated: pages are
//           still downloaded and indexed, only the notification is skipped.
void WebCrawlerImpl::crawl(void(*callback)(std::string url, bool success))
{
    while (!m_storedUrls.empty())
    {
        // addUrl() pushes to the front, so the back holds the oldest entry.
        std::string url = m_storedUrls.back();
        m_storedUrls.pop_back();

        // Use a fresh buffer for every URL so content from a previous
        // download can never be carried into this iteration.
        std::string page;
        bool success = false;

        // Step 1
        if (HTTP().get(url, page))
        {
            // Step 2
            WordBag wb(page);
            m_webCrawlerIndex.incorporate(url, wb);
            success = true;
        }

        // Step 3: calling through a null function pointer is undefined
        // behaviour, so only notify when a callback was supplied.
        if (callback)
            callback(url, success);
    }
}
// Asks the Indexer to save itself using `filenameBase` and returns the
// Indexer's result unchanged.
bool WebCrawlerImpl::save(std::string filenameBase)
{
    const bool saved = m_webCrawlerIndex.save(filenameBase);
    return saved;
}
// Asks the Indexer to load itself using `filenameBase` and returns the
// Indexer's result unchanged.
bool WebCrawlerImpl::load(std::string filenameBase)
{
    const bool loaded = m_webCrawlerIndex.load(filenameBase);
    return loaded;
}
//******************** WebCrawler functions *******************************
// These functions simply delegate to WebCrawlerImpl's functions.
// You probably don't want to change any of this code.
// Allocates the implementation object that every other WebCrawler member
// forwards to; the destructor releases it.
WebCrawler::WebCrawler()
{
    m_impl = new WebCrawlerImpl();
}
// Releases the implementation object allocated by the constructor.
WebCrawler::~WebCrawler()
{
    delete this->m_impl;
}
void WebCrawler::addUrl(std::string url)
{
m_impl->addUrl(url);
}
int WebCrawler::getNumberOfUrls() const
{
return m_impl->getNumberOfUrls();
}
void WebCrawler::crawl(void(*callback)(std::string url, bool success))
{
m_impl->crawl(callback);
}
bool WebCrawler::save(std::string filenameBase)
{
return m_impl->save(filenameBase);
}
bool WebCrawler::load(std::string filenameBase)
{
return m_impl->load(filenameBase);
}