// Copyright 1999 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// -----------------------------------------------------------------------------
// File: robots.h
// -----------------------------------------------------------------------------
//
// This file implements the standard defined by the Robots Exclusion Protocol
// (REP) internet draft (I-D).
// https://tools.ietf.org/html/draft-rep-wg-topic
//
// Google doesn't follow the standard strictly, because there are a lot of
// non-conforming robots.txt files out there, and we err on the side of
// disallowing when this seems intended.
//
// A more user-friendly description of how Google handles robots.txt can be
// found at:
// https://developers.google.com/search/reference/robots_txt
//
// This library provides a low-level parser for robots.txt (ParseRobotsTxt()),
// and a matcher for URLs against a robots.txt (class RobotsMatcher).
#ifndef THIRD_PARTY_ROBOTSTXT_ROBOTS_H__
#define THIRD_PARTY_ROBOTSTXT_ROBOTS_H__

#include <string>
#include <vector>

#include "absl/strings/string_view.h"

namespace googlebot {
// Handler for directives found in robots.txt. These callbacks are called by
// ParseRobotsTxt() in the sequence they have been found in the file.
class RobotsParseHandler {
 public:
  RobotsParseHandler() {}
  virtual ~RobotsParseHandler() {}

  // Disallow copying and assignment.
  RobotsParseHandler(const RobotsParseHandler&) = delete;
  RobotsParseHandler& operator=(const RobotsParseHandler&) = delete;

  virtual void HandleRobotsStart() = 0;
  virtual void HandleRobotsEnd() = 0;

  virtual void HandleUserAgent(int line_num, absl::string_view value) = 0;
  virtual void HandleAllow(int line_num, absl::string_view value) = 0;
  virtual void HandleDisallow(int line_num, absl::string_view value) = 0;
  virtual void HandleSitemap(int line_num, absl::string_view value) = 0;

  // Any other unrecognized name/value pairs.
  virtual void HandleUnknownAction(int line_num, absl::string_view action,
                                   absl::string_view value) = 0;
};

// Parses body of a robots.txt and emits parse callbacks. This will accept
// typical typos found in robots.txt, such as 'disalow'.
//
// Note, this function will accept all kind of input but will skip
// everything that does not look like a robots directive.
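//
// Example of wiring a handler into the parser (a minimal sketch; the
// SitemapCollector class, the robots_txt_content string and its contents are
// illustrative only and not part of this library):
//
//   class SitemapCollector : public googlebot::RobotsParseHandler {
//    public:
//     void HandleRobotsStart() override {}
//     void HandleRobotsEnd() override {}
//     void HandleUserAgent(int line_num, absl::string_view value) override {}
//     void HandleAllow(int line_num, absl::string_view value) override {}
//     void HandleDisallow(int line_num, absl::string_view value) override {}
//     void HandleSitemap(int line_num, absl::string_view value) override {
//       sitemaps_.emplace_back(value.data(), value.size());
//     }
//     void HandleUnknownAction(int line_num, absl::string_view action,
//                              absl::string_view value) override {}
//     std::vector<std::string> sitemaps_;
//   };
//
//   const std::string robots_txt_content =
//       "User-agent: *\nSitemap: https://example.com/sitemap.xml\n";
//   SitemapCollector collector;
//   googlebot::ParseRobotsTxt(robots_txt_content, &collector);
//   // collector.sitemaps_ should now contain the sitemap URL.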
void ParseRobotsTxt(absl::string_view robots_body,
                    RobotsParseHandler* parse_callback);

// RobotsMatcher - matches robots.txt against URLs.
//
// The Matcher uses a default match strategy for Allow/Disallow patterns which
// is the official way of Google crawler to match robots.txt. It is also
// possible to provide a custom match strategy.
//
// The entry point for the user is to call one of the *AllowedByRobots()
// methods that return directly if a URL is being allowed according to the
// robots.txt and the crawl agent.
// The RobotsMatcher can be re-used for URLs/robots.txt but is not thread-safe.
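//
// Example of a single-agent check (a minimal sketch; the robots.txt body,
// agent name and URL below are illustrative values only):
//
//   googlebot::RobotsMatcher matcher;
//   bool allowed = matcher.OneAgentAllowedByRobots(
//       "User-agent: *\nDisallow: /private\n",      // robots.txt body
//       "FooBot",                                    // crawl agent name
//       "https://example.com/private/page.html");   // %-encoded URL
//   // allowed is false here: the URL path /private/page.html falls under
//   // the Disallow: /private rule.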
class RobotsMatchStrategy;
class RobotsMatcher : protected RobotsParseHandler {
 public:
  // Create a RobotsMatcher with the default matching strategy. The default
  // matching strategy is longest-match as opposed to the former internet draft
  // that provisioned first-match strategy. Analysis shows that longest-match,
  // while more restrictive for crawlers, is what webmasters assume when
  // writing directives. For example, in case of conflicting matches (both
  // Allow and Disallow), the longest match is the one the user wants. For
  // example, in case of a robots.txt file that has the following rules
  //   Allow: /
  //   Disallow: /cgi-bin
  // it's pretty obvious what the webmaster wants: they want to allow crawl of
  // every URI except /cgi-bin. However, according to the expired internet
  // standard, crawlers should be allowed to crawl everything with such a rule.
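  //
  // As an illustrative walk-through of the longest-match rule (values are for
  // illustration only): for the URL path "/cgi-bin/env.pl" under the two
  // rules above,
  //   Allow: /             matches 1 character of the path,
  //   Disallow: /cgi-bin   matches 8 characters of the path,
  // so the longer Disallow match wins and crawling is disallowed; for the
  // path "/index.html" only "Allow: /" matches, so crawling is allowed.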
  RobotsMatcher();

  ~RobotsMatcher() override;

  // Disallow copying and assignment.
  RobotsMatcher(const RobotsMatcher&) = delete;
  RobotsMatcher& operator=(const RobotsMatcher&) = delete;

  // Verifies that the given user agent is valid to be matched against
  // robots.txt. Valid user agent strings only contain the characters
  // [a-zA-Z_-].
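  // For example (illustrative values): "Googlebot" or "FooBot-News" would be
  // accepted here, while "Googlebot/2.1" would not, because '/' and '.' are
  // outside [a-zA-Z_-].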
  static bool IsValidUserAgentToObey(absl::string_view user_agent);

  // Returns true iff 'url' is allowed to be fetched by any member of the
  // "user_agents" vector. 'url' must be %-encoded according to RFC3986.
  bool AllowedByRobots(absl::string_view robots_body,
                       const std::vector<std::string>* user_agents,
                       const std::string& url);

  // Do robots check for 'url' when there is only one user agent. 'url' must
  // be %-encoded according to RFC3986.
  bool OneAgentAllowedByRobots(absl::string_view robots_txt,
                               const std::string& user_agent,
                               const std::string& url);

  // Returns true if we are disallowed from crawling a matching URI.
  bool disallow() const;

  // Returns true if we are disallowed from crawling a matching URI. Ignores
  // any rules specified for the default user agent, and bases its results
  // only on the specified user agents.
  bool disallow_ignore_global() const;

  // Returns true iff, when AllowedByRobots() was called, the robots file
  // referred explicitly to one of the specified user agents.
  bool ever_seen_specific_agent() const;

  // Returns the line that matched or 0 if none matched.
  const int matching_line() const;

 protected:
  // Parse callbacks.
  // Protected because used in unittest. Never override RobotsMatcher,
  // implement googlebot::RobotsParseHandler instead.
  void HandleRobotsStart() override;
  void HandleRobotsEnd() override {}

  void HandleUserAgent(int line_num, absl::string_view value) override;
  void HandleAllow(int line_num, absl::string_view value) override;
  void HandleDisallow(int line_num, absl::string_view value) override;
  void HandleSitemap(int line_num, absl::string_view value) override;

  void HandleUnknownAction(int line_num, absl::string_view action,
                           absl::string_view value) override;

 protected:
  // Extract the matchable part of a user agent string, essentially stopping
  // at the first invalid character.
  // Example: 'Googlebot/2.1' becomes 'Googlebot'
  static absl::string_view ExtractUserAgent(absl::string_view user_agent);

  // Initialize next path and user-agents to check. Path must contain only the
  // path, params, and query (if any) of the url and must start with a '/'.
  void InitUserAgentsAndPath(const std::vector<std::string>* user_agents,
                             const char* path);

  // Returns true if any user-agent was seen.
  bool seen_any_agent() const {
    return seen_global_agent_ || seen_specific_agent_;
  }

  // Instead of just maintaining a Boolean indicating whether a given line has
  // matched, we maintain a count of the maximum number of characters matched
  // by that pattern.
  //
  // This structure stores the information associated with a match (e.g. when
  // a Disallow is matched) as priority of the match and line matching.
  //
  // The priority is initialized with a negative value to make sure that a
  // match of priority 0 is higher priority than no match at all.
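  //
  // For illustration: a "Disallow: /cgi-bin" rule that matches a URL path is
  // recorded as a Match with priority 8 (the number of characters matched),
  // and HigherPriorityMatch() below prefers any such match over a
  // default-constructed Match, whose priority is kNoMatchPriority (-1).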
  class Match {
   private:
    static const int kNoMatchPriority = -1;

   public:
    Match(int priority, int line) : priority_(priority), line_(line) {}
    Match() : priority_(kNoMatchPriority), line_(0) {}

    void Set(int priority, int line) {
      priority_ = priority;
      line_ = line;
    }
    void Clear() { Set(kNoMatchPriority, 0); }

    int line() const { return line_; }
    int priority() const { return priority_; }

    static const Match& HigherPriorityMatch(const Match& a, const Match& b) {
      if (a.priority() > b.priority()) {
        return a;
      } else {
        return b;
      }
    }

   private:
    int priority_;
    int line_;
  };

  // For each of the directives within user-agents, we keep global and
  // specific match scores.
  struct MatchHierarchy {
    Match global;    // Match for '*'
    Match specific;  // Match for queried agent.
    void Clear() {
      global.Clear();
      specific.Clear();
    }
  };
  MatchHierarchy allow_;     // Characters of 'url' matching Allow.
  MatchHierarchy disallow_;  // Characters of 'url' matching Disallow.

  bool seen_global_agent_;         // True if processing global agent rules.
  bool seen_specific_agent_;       // True if processing our specific agent.
  bool ever_seen_specific_agent_;  // True if we ever saw a block for our agent.
  bool seen_separator_;            // True if saw any key: value pair.

  // The path we want to pattern match. Not owned and only a valid pointer
  // during the lifetime of *AllowedByRobots calls.
  const char* path_;
  // The User-Agents we are interested in. Not owned and only a valid
  // pointer during the lifetime of *AllowedByRobots calls.
  const std::vector<std::string>* user_agents_;

  RobotsMatchStrategy* match_strategy_;
};

}  // namespace googlebot

#endif  // THIRD_PARTY_ROBOTSTXT_ROBOTS_H__