crawler-default.yaml
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Default configuration for Apache StormCrawler
# This is used to make the default values explicit and list the most common configurations.
# Do not modify this file but instead provide a custom one with the parameter -conf
# when launching your extension of ConfigurableTopology.
config:
fetcher.server.delay: 1.0
# min. delay for multi-threaded queues
fetcher.server.min.delay: 0.0
fetcher.queue.mode: "byHost"
fetcher.threads.per.queue: 1
fetcher.threads.number: 10
fetcher.threads.start.delay: 10
fetcher.max.urls.in.queues: -1
fetcher.max.queue.size: -1
fetcher.timeout.queue: -1
# max. crawl-delay accepted in robots.txt (in seconds)
fetcher.max.crawl.delay: 30
# behavior of fetcher when the crawl-delay in the robots.txt
# is larger than fetcher.max.crawl.delay:
# (if false)
# skip URLs from this queue so that an overlong
# crawl-delay does not throttle the crawler
# (if true)
# set the delay to fetcher.max.crawl.delay,
# making fetcher more aggressive than requested
fetcher.max.crawl.delay.force: false
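# Illustrative example (values are hypothetical): with fetcher.max.crawl.delay: 30
# and a robots.txt declaring "Crawl-delay: 120", setting this to false makes the
# fetcher skip the URLs from that queue, whereas setting it to true fetches them
# with a 30 second delay instead of the requested 120 seconds.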
# behavior of fetcher when the crawl-delay in the robots.txt
# is smaller (possibly less than one second) than the default delay:
# (if true)
# use the larger default delay (fetcher.server.delay)
# and ignore the shorter crawl-delay in the robots.txt
# (if false)
# use the delay specified in the robots.txt
fetcher.server.delay.force: false
# time bucket to use for the metrics sent by the Fetcher
fetcher.metrics.time.bucket.secs: 10
# SimpleFetcherBolt: if the delay required by the politeness
# is above this value, the tuple is sent back to the Storm queue
# for the bolt on the _throttle_ stream (in msec)
fetcher.max.throttle.sleep: -1
# alternative values are "byIP" and "byDomain"
partition.url.mode: "byHost"
urlbuffer.class: "org.apache.stormcrawler.persistence.urlbuffer.SimpleURLBuffer"
# Lists the metadata to transfer to outlinks
# Used by Fetcher and SiteMapParser for redirections,
# discovered links, passing cookies to child pages, etc.
# These are also persisted for the parent document (see below).
# Allows wildcards, e.g. "follow.*" transfers all metadata starting with "follow.".
# metadata.transfer:
# - customMetadataName
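# Illustrative example (key names are hypothetical): transfer a custom key as well
# as every key starting with "follow." to the outlinks
# metadata.transfer:
#  - customMetadataName
#  - "follow.*"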
# Lists the metadata to persist to storage
# These are not transferred to the outlinks. Also allows wildcards, e.g. "follow.*".
metadata.persist:
- _redirTo
- error.cause
- error.source
- isSitemap
- isFeed
metadata.track.path: true
metadata.track.depth: true
# Agent name info - given here as an example. Do not be an anonymous coward, use your real information!
# The full user agent value sent as part of the HTTP requests
# is built from the elements below. Only the agent.name is mandatory;
# it is also used to parse the robots.txt directives.
# The agent name must be compliant with RFC 9309 (section 2.2.1),
# i.e. it MUST contain only uppercase and lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens ("-")
# http.agent.name: "AnonymousCoward"
# version of your crawler
# http.agent.version: "1.0"
# description of what it does
# http.agent.description: "built with StormCrawler"
# URL webmasters can visit to learn about it
# http.agent.url: "http://someorganization.com/"
# Finally, an email so that they can get in touch with you
# http.agent.email: "someone@someorganization.com"
# user-agent name(s), used to select rules from the
# robots.txt file by matching the names against the user-agent
# lines in the robots.txt file. Optional; if empty, the value
# of http.agent.name is used. Otherwise, http.agent.name must be listed first.
# The tokens must be compliant with RFC 9309 (section 2.2.1).
# http.robots.agents: agents as a comma-separated string, but can also take a list
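# Illustrative example (agent names are hypothetical), either as a comma-separated string
# http.robots.agents: "mycrawler,mycompany-bot"
# or as a list, with the value of http.agent.name listed first
# http.robots.agents:
#  - "mycrawler"
#  - "mycompany-bot"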
# (advanced) Specify the user agent sent with the HTTP requests
# note that this is not used for parsing the robots.txt and
# therefore you need to have set _http.agent.name_.
# http.agent: "Verbatim user agent"
http.accept.language: "en-us,en-gb,en;q=0.7,*;q=0.3"
http.accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"
http.content.limit: -1
http.store.headers: false
http.timeout: 10000
# store partial fetches as trimmed content (some content has been fetched,
# but reading more data from the socket failed, e.g. because of a network timeout)
http.content.partial.as.trimmed: false
# for crawling through a proxy:
# 1-line config
# http.proxy: "http://localhost:8000"
# http.proxy.host:
# http.proxy.port:
# okhttp only, defaults to "HTTP"
# http.proxy.type: "SOCKS"
# for crawling through a proxy with Basic authentication:
# http.proxy.user:
# http.proxy.pass:
# Retry on connection failure:
http.retry.on.connection.failure: true
# Follow redirect HTTP responses:
http.allow.redirects: false
# Allow all if robots.txt cannot be parsed due to code 403 (Forbidden):
http.robots.403.allow: true
# Allow all if robots.txt cannot be parsed due to a server error (5xx):
http.robots.5xx.allow: false
# ignore directives from robots.txt files?
http.robots.file.skip: false
# ignore robots directives from the http headers?
http.robots.headers.skip: false
# ignore robots directives from the html meta?
http.robots.meta.skip: false
# should the URLs be removed when a page is marked as noFollow
robots.noFollow.strict: true
# http.content.limit when fetching the robots.txt
# (the robots.txt RFC draft requires fetching and parsing at least 500 kiB,
# see https://datatracker.ietf.org/doc/html/draft-rep-wg-topic-00#section-2.5)
# http.robots.content.limit: 524288 # 512 kiB
http.robots.content.limit: -1 # default same as http.content.limit
# Guava caches used for the robots.txt directives
robots.cache.spec: "maximumSize=10000,expireAfterWrite=6h"
robots.error.cache.spec: "maximumSize=10000,expireAfterWrite=1h"
protocols: "http,https,file"
http.protocol.implementation: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
https.protocol.implementation: "org.apache.stormcrawler.protocol.httpclient.HttpProtocol"
file.protocol.implementation: "org.apache.stormcrawler.protocol.file.FileProtocol"
# number of instances for each protocol implementation
protocol.instances.num: 1
# the http/https protocol versions to use, in order of preference
# Details of the protocol negotiation between the client and
# the crawled server depend on the chosen protocol implementation.
# If no protocol versions are listed the protocol implementation
# will use its defaults.
http.protocol.versions:
# HTTP/2 over TLS (protocol negotiation via ALPN)
#- "h2"
# HTTP/1.1
#- "http/1.1"
# HTTP/1.0
#- "http/1.0"
# HTTP/2 over TCP
##- "h2c"
# connection pool configuration of OkHttp protocol
okhttp.protocol.connection.pool:
# maximum number of idle connections (in addition to active connections)
max.idle.connections: 5
# maximum keep-alive time of the connections in seconds
connection.keep.alive: 300
# See also
# https://square.github.io/okhttp/3.x/okhttp/okhttp3/ConnectionPool.html
# Note that OkHttp's connection pool (v4.9.1) is not optimized for fast
# look-up of connections; the pool size (idle and active connections)
# should not exceed 1000. To allow for efficient pooling in large and
# diverse crawls, it's recommended to also increase the number of protocol
# instances, see `protocol.instances.num`.
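# Illustrative example (values are hypothetical): for a large, diverse crawl,
# keep the pool comfortably below 1000 connections per instance and spread
# the load over several protocol instances
# okhttp.protocol.connection.pool:
#  max.idle.connections: 100
#  connection.keep.alive: 120
# protocol.instances.num: 4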
# key values obtained by the protocol can be prefixed
# to avoid accidental overwrites. Note that persisted
# or transferred protocol metadata must also be prefixed.
protocol.md.prefix: "protocol."
# navigationfilters.config.file: "navigationfilters.json"
# selenium.addresses: "http://localhost:9515"
selenium.tracing: false
# rely on selenium's default values
# set to a value >= 0 to override
selenium.timeouts:
script: -1
pageLoad: -1
implicit: -1
# selenium.capabilities:
# a browser name is required
# browserName: "chrome"
# illustrates the use of the $userAgent variable for the user agent
# phantomjs.page.settings.userAgent: "$userAgent"
# ChromeDriver config
# goog:chromeOptions:
# args:
# - "--headless"
# - "--disable-gpu"
# - "--mute-audio"
# no URL filters or parse filters configured by default
# parsefilters.config.file: "parsefilters.json"
# urlfilters.config.file: "urlfilters.json"
# JSoupParserBolt
jsoup.treat.non.html.as.error: true
parser.emitOutlinks: true
parser.emitOutlinks.max.per.page: -1
track.anchors: true
detect.mimetype: true
detect.charset.maxlength: 10000
textextractor.skip.after: -1
# filters URLs in sitemaps based on their modified Date (if any)
sitemap.filter.hours.since.modified: -1
# staggered scheduling of sitemaps
sitemap.schedule.delay: -1
# whether to add any sitemaps found in the robots.txt to the status stream
# used by fetcher bolts
sitemap.discovery: false
# determines what sitemap extensions to parse from the sitemap and add
# to an outlinks metadata object
sitemap.extensions:
# Illustrates enabling sitemap extension parsing
# there are 5 supported types "IMAGE", "LINKS", "MOBILE", "NEWS", and "VIDEO"
# sitemap.extensions:
# - IMAGE
# - LINKS
# - MOBILE
# - NEWS
# - VIDEO
# Default implementation of Scheduler
scheduler.class: "org.apache.stormcrawler.persistence.DefaultScheduler"
# revisit a page daily (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.default: 1440
# revisit a page with a fetch error after 2 hours (value in minutes)
# set it to -1 to never refetch a page
fetchInterval.fetch.error: 120
# never revisit a page with an error (or set a value in minutes)
fetchInterval.error: -1
# custom fetch interval to be used when a document has the key/value in its metadata
# and has been fetched successfully (value in minutes)
# fetchInterval.FETCH_ERROR.isFeed=true
# fetchInterval.isFeed=true: 10
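# Illustrative example (values are hypothetical): refetch feed documents every
# 10 minutes after a successful fetch; the FETCH_ERROR-prefixed form presumably
# applies a custom interval after a failed fetch instead
# fetchInterval.isFeed=true: 10
# fetchInterval.FETCH_ERROR.isFeed=true: 30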
# max number of successive fetch errors before changing status to ERROR
max.fetch.errors: 3
# Guava cache used by AbstractStatusUpdaterBolt for DISCOVERED URLs
status.updater.use.cache: true
status.updater.cache.spec: "maximumSize=10000,expireAfterAccess=1h"
# Can also take "MINUTE" or "HOUR"
status.updater.unit.round.date: "SECOND"
# configuration for the classes extending AbstractIndexerBolt
# indexer.md.filter: "someKey=aValue"
indexer.ignore.empty.fields: false
indexer.url.fieldname: "url"
indexer.text.fieldname: "content"
indexer.text.maxlength: -1
indexer.canonical.name: "canonical"
# How to convert metadata key values into fields for indexing
#
# if no alias is specified with =alias, the key value is used
# for instance below, _domain_ and _format_ will be used
# as field names, whereas _title_ will be used for _parse.title_.
# You can specify the index of the value to store from the values array
# by using the _key[index]_ format, e.g. _parse.title[0]_ would try to
# get the first value for the metadata _parse.title_ (which is the default anyway).
# Finally, you can use a glob (*) to match all the keys, e.g. _parse.*_ would
# index all the keys with _parse_ as a prefix. Note that in that case, you can't
# specify an alias with =, nor can you specify an index.
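# For instance (illustrative, commented out), the index and glob syntaxes described above:
# - parse.title[0]=title
# - parse.*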
indexer.md.mapping:
- parse.title=title
- parse.keywords=keywords
- parse.description=description