crawler-default.yaml
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default configuration for Apache StormCrawler
# This is used to make the default values explicit and list the most common configurations.
# Do not modify this file but instead provide a custom one with the parameter -conf
# when launching your extension of ConfigurableTopology.
config:
fetcher.server.delay: 1.0
# min. delay for multi-threaded queues
fetcher.server.min.delay: 0.0
fetcher.queue.mode: "byHost"
fetcher.threads.per.queue: 1
fetcher.threads.number: 10
fetcher.threads.start.delay: 10
fetcher.max.urls.in.queues: -1
fetcher.max.queue.size: -1
fetcher.timeout.queue: -1
# max. crawl-delay accepted in robots.txt (in seconds)
fetcher.max.crawl.delay: 30
# behavior of fetcher when the crawl-delay in the robots.txt
# is larger than fetcher.max.crawl.delay:
# (if false)
# skip URLs from this queue so that an overlong
# crawl-delay does not throttle the crawler
# (if true)
# set the delay to fetcher.max.crawl.delay,
# making fetcher more aggressive than requested
fetcher.max.crawl.delay.force: false
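# Illustrative example (values are hypothetical): with fetcher.max.crawl.delay: 30
# and a robots.txt declaring "Crawl-delay: 120", setting this to false makes the
# fetcher skip the URLs from that queue, whereas setting it to true fetches them
# with a 30 second delay instead of the requested 120 seconds.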
# behavior of fetcher when the crawl-delay in the robots.txt
# is smaller (possibly less than one second) than the default delay:
# (if true)
# use the larger default delay (fetcher.server.delay)
# and ignore the shorter crawl-delay in the robots.txt
# (if false)
# use the delay specified in the robots.txt
fetcher.server.delay.force: false
# time bucket to use for the metrics sent by the Fetcher
fetcher.metrics.time.bucket.secs: 10
# SimpleFetcherBolt: if the delay required by the politeness
# is above this value, the tuple is sent back to the Storm queue
# for the bolt on the _throttle_ stream (in msec)
fetcher.max.throttle.sleep: -1
# alternative values are "byIP" and "byDomain"
partition.url.mode: "byHost"
urlbuffer.class: "org.apache.stormcrawler.persistence.urlbuffer.SimpleURLBuffer"
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, e.g. "follow.*" transfers all metadata starting with "follow.".
# metadata.transfer:
# - customMetadataName
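# Illustrative example (key names are hypothetical): transfer a custom key as well
# as every key starting with "follow." to the outlinks
# metadata.transfer:
#  - customMetadataName
#  - "follow.*"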
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
metadata.persist:
- _redirTo
- error.cause
- error.source
- isSitemap
- isFeed
metadata.track.path: true
metadata.track.depth: true
# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory;
# it is also used to parse the robots.txt directives.
# The agent name must be compliant with RFC 9309 (section 2.2.1),
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
# http.agent.name: "AnonymousCoward"
# version of your crawler
# http.agent.version: "1.0"
# description of what it does
# http.agent.description: "built with StormCrawler"
# URL webmasters can visit to learn about it
# http.agent.url: "http://someorganization.com/"
# Finally, an email so that they can get in touch with you
# http.agent.email: "someone@someorganization.com"
# user-agent name(s), used to select rules from the
# robots.txt file by matching the names against the user-agent
# lines in the robots.txt file. Optional; if empty, the value
# of http.agent.name is used. Otherwise, http.agent.name must be listed first.
# The tokens must be compliant with RFC 9309 (section 2.2.1).
# http.robots.agents: agents as a comma-separated string, but can also take a list
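# Illustrative example (agent names are hypothetical), either as a comma-separated string
# http.robots.agents: "mycrawler,mycompany-bot"
# or as a list, with the value of http.agent.name listed first
# http.robots.agents:
#  - "mycrawler"
#  - "mycompany-bot"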
# (advanced) Specify the user agent sent with the HTTP requests
# note that this is not used for parsing the robots.txt and
# therefore you need to have set _http.agent.name_.
# http.agent: "Verbatim user agent"
http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
http.content.limit: -1
http.store.headers: false
http.timeout: 10000
# store partial fetches as trimmed content (some content has been fetched,
# but reading more data from the socket failed, e.g. because of a network timeout)
http.content.partial.as.trimmed: false
# for crawling through a proxy:
# 1-line config
# http.proxy: "http://localhost:8000"
# http.proxy.host:
# http.proxy.port:
# okhttp only, defaults to "HTTP"
# http.proxy.type: "SOCKS"
# for crawling through a proxy with Basic authentication:
# http.proxy.user:
# http.proxy.pass:
# Retry on connection failure:
http.retry.on.connection.failure: true
# Follow redirect HTTP responses:
http.allow.redirects: false
# Allow all if robots.txt cannot be parsed due to code 403 (Forbidden):
http.robots.403.allow: true
# Allow all if robots.txt cannot be parsed due to a server error (5xx):
http.robots.5xx.allow: false
# ignore directives from robots.txt files?
http.robots.file.skip: false
# ignore robots directives from the http headers?
http.robots.headers.skip: false
# ignore robots directives from the html meta?
http.robots.meta.skip: false
# should the URLs be removed when a page is marked as noFollow
robots.noFollow.strict: true
# http.content.limit when fetching the robots.txt
# (the robots.txt RFC draft requires fetching and parsing at least 500 kiB,
# see https://datatracker.ietf.org/doc/html/draft-rep-wg-topic-00#section-2.5)
# http.robots.content.limit: 524288 # 512 kiB
http.robots.content.limit: -1 # default same as http.content.limit
# Guava caches used for the robots.txt directives
robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"
protocols: "http,https,file"
http.protocol.implementation: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
https.protocol.implementation: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
file.protocol.implementation: "org.apache.stormcrawler.protocol.file.FileProtocol"
# number of instances for each protocol implementation
protocol.instances.num: 1
# the http/https protocol versions to use, in order of preference
# Details of the protocol negotiation between the client and
# the crawled server depend on the chosen protocol implementation.
# If no protocol versions are listed the protocol implementation
# will use its defaults.
http.protocol.versions:
# HTTP/2 over TLS (protocol negotiation via ALPN)
#- "h2"
# HTTP/1.1
#- "http/1.1"
# HTTP/1.0
#- "http/1.0"
# HTTP/2 over TCP
##- "h2c"
# connection pool configuration of OkHttp protocol
okhttp.protocol.connection.pool:
# maximum number of idle connections (in addition to active connections)
max.idle.connections: 5
# maximum keep-alive time of the connections in seconds
connection.keep.alive: 300
# See also
# https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html
# Note that OkHttp's connection pool (v4.9.1) is not optimized for fast
# look-up of connections; the pool size (idle and active connections)
# should not exceed 1000. To allow for efficient pooling in large and
# diverse crawls, it's recommended to also increase the number of protocol
# instances, see `protocol.instances.num`.
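# Illustrative example (values are hypothetical): for a large, diverse crawl,
# keep the pool comfortably below 1000 connections per instance and spread
# the load over several protocol instances
# okhttp.protocol.connection.pool:
#  max.idle.connections: 100
#  connection.keep.alive: 120
# protocol.instances.num: 4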
# key values obtained by the protocol can be prefixed
# to avoid accidental overwrites. Note that persisted
# or transferred protocol metadata must also be prefixed.
protocol.md.prefix: "protocol."
# navigationfilters.config.file: "navigationfilters.json"
# selenium.addresses: "http://localhost:9515"
selenium.tracing: false
# rely on selenium's default values
# set to a value >= 0 to override
selenium.timeouts:
script: -1
pageLoad: -1
implicit: -1
# selenium.capabilities:
# a browser name is required
# browserName: "chrome"
# illustrates the use of the $userAgent variable for the user agent
# phantomjs.page.settings.userAgent: "$userAgent"
# ChromeDriver config
# goog:chromeOptions:
# args:
# - "--headless"
# - "--disable-gpu"
# - "--mute-audio"
# no URL filters or parse filters configured by default
# parsefilters.config.file: "parsefilters.json"
# urlfilters.config.file: "urlfilters.json"
# JSoupParserBolt
jsoup.treat.non.html.as.error: true
parser.emitOutlinks: true
parser.emitOutlinks.max.per.page: -1
track.anchors: true
detect.mimetype: true
detect.charset.maxlength: 10000
textextractor.skip.after: -1
# filters URLs in sitemaps based on their modified Date (if any)
sitemap.filter.hours.since.modified: -1
# staggered scheduling of sitemaps
sitemap.schedule.delay: -1
# whether to add any sitemaps found in the robots.txt to the status stream
# used by fetcher bolts
sitemap.discovery: false
# determines what sitemap extensions to parse from the sitemap and add
# to an outlinks metadata object
sitemap.extensions:
# Illustrates enabling sitemap extension parsing
# there are 5 supported types "IMAGE", "LINKS", "MOBILE", "NEWS", and "VIDEO"
# sitemap.extensions:
# - IMAGE
# - LINKS
# - MOBILE
# - NEWS
# - VIDEO
# Default implementation of Scheduler
scheduler.class: "org.apache.stormcrawler.persistence.DefaultScheduler"
# revisit a page daily (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.default: 1440
# revisit a page with a fetch error after 2 hours (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.fetch.error: 120
# never revisit a page with an error (or set a value in minutes)
fetchInterval.error: -1
# custom fetch interval to be used when a document has the key/value in its metadata
# and has been fetched successfully (value in minutes)
# fetchInterval.FETCH_ERROR.isFeed=true
# fetchInterval.isFeed=true: 10
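# Illustrative example (values are hypothetical): refetch feed documents every
# 10 minutes after a successful fetch; the FETCH_ERROR-prefixed form presumably
# applies a custom interval after a failed fetch instead
# fetchInterval.isFeed=true: 10
# fetchInterval.FETCH_ERROR.isFeed=true: 30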
# max number of successive fetch errors before changing status to ERROR
max.fetch.errors: 3
# Guava cache used by AbstractStatusUpdaterBolt for DISCOVERED URLs
status.updater.use.cache: true
status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"
# Can also take "MINUTE" or "HOUR"
status.updater.unit.round.date: "SECOND"
# configuration for the classes extending AbstractIndexerBolt
# indexer.md.filter: "someKey=aValue"
indexer.ignore.empty.fields: false
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.text.maxlength: -1
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# if no alias is specified with =alias, the key value is used
# for instance below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_ (which is the default anyway).
# Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would
# index all the keys with _parse_ as a prefix. Note that in that case, you can't
# specify an alias with =, nor can you specify an index.
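# For instance (illustrative, commented out), the index and glob syntaxes described above:
# - parse.title[0]=title
# - parse.*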
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
- parse.description=description