##################### ACHE Configuration Example #####################
#
# Configurations for Target Storage
#
# Set to false if you don't want to store pages classified as irrelevant
target_storage.store_negative_pages: false
# Configuration for the data format used to store crawled data
# Enable one of the following lines to use one of the FILESYSTEM_* data formats
#target_storage.data_format.type: FILESYSTEM_HTML
#target_storage.data_format.type: FILESYSTEM_CBOR
#target_storage.data_format.type: FILESYSTEM_JSON
#
# Enable this to name files using a fixed-length hash instead of the percent-encoded URL
#target_storage.data_format.filesystem.hash_file_name: true
#
# Enable this to compress the file content
#target_storage.data_format.filesystem.compress_data: true
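#
# For example, a (commented) sketch combining only the keys above, storing
# raw HTML in compressed files named by URL hash:
#target_storage.data_format.type: FILESYSTEM_HTML
#target_storage.data_format.filesystem.hash_file_name: true
#target_storage.data_format.filesystem.compress_data: true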
# Enable this to use the FILES data format
target_storage.data_format.type: FILES
target_storage.data_format.files.max_file_size: 134217728 # 128 MB in bytes
# Enable the following lines to use the WARC data format and change its default settings
#target_storage.data_format.type: WARC # enable WARC file format
#target_storage.data_format.warc.compress: true # enable GZIP compression
#target_storage.data_format.warc.max_file_size: 262144000 # maximum file size in bytes
# Enable the following line to index pages in Elasticsearch
#target_storage.data_format.type: ELASTICSEARCH
#
# (Transport client ES 1.5)
#target_storage.data_format.elasticsearch.host: localhost
#target_storage.data_format.elasticsearch.port: 9300
#target_storage.data_format.elasticsearch.cluster_name: elasticsearch
#
# (REST client ES 1.x and 5.x)
#target_storage.data_format.elasticsearch.rest.hosts:
# - http://localhost:9200
#target_storage.data_format.elasticsearch.rest.connect_timeout: 30000
#target_storage.data_format.elasticsearch.rest.socket_timeout: 30000
#target_storage.data_format.elasticsearch.rest.max_retry_timeout_millis: 90000
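#
# For illustration only, the REST configuration could list several nodes
# (the hostnames below are hypothetical placeholders):
#target_storage.data_format.elasticsearch.rest.hosts:
#  - http://es-node-1:9200
#  - http://es-node-2:9200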
# Instead of configuring a single data format, you can also configure multiple
# data formats in a list as follows. The settings for each data format should
# be configured independently, as if you were configuring a single data format.
# In the following config, data will be pushed to both FILES and ELASTICSEARCH.
#
#target_storage.data_formats:
# - FILES
# - ELASTICSEARCH
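#
# For example, the following (commented) sketch stores pages in FILES on disk
# and also indexes them in Elasticsearch, reusing only keys shown above:
#target_storage.data_formats:
#  - FILES
#  - ELASTICSEARCH
#target_storage.data_format.files.max_file_size: 134217728
#target_storage.data_format.elasticsearch.rest.hosts:
#  - http://localhost:9200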
# Whether to perform hard focus or soft focus. When hard focus is enabled,
# the crawler only follows links from pages classified as relevant
target_storage.hard_focus: true
# Run bipartite crawler
target_storage.bipartite: false
# Maximum number of pages to visit
target_storage.visited_page_limit: 10000000
# Store only pages that contain English text, using a language detector
target_storage.english_language_detection_enabled: true
#
# Configurations for Link Storage
#
# Max number of pages to be crawled from each web domain
link_storage.max_pages_per_domain: 100
# Restricts the crawler to the websites provided as seeds
link_storage.link_strategy.use_scope: false
# Allows the crawler to follow forward links
link_storage.link_strategy.outlinks: true
# Gets backlinks of the pages from a search engine; used by bipartite crawling
link_storage.link_strategy.backlinks: false
# Type of link classifier used by link storage
# - LinkClassifierBaseline: random link strategy when no page classifier is provided, or Soumen's baseline strategy when a page classifier is provided
# - LinkClassifierImpl: link strategy using a link classifier
# - LinkClassifierAuthority: link strategy for the bipartite crawling
link_storage.link_classifier.type: LinkClassifierBaseline
#link_storage.link_classifier.type: LinkClassifierImpl
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"]
# Restricts the crawler to links within a given number of "hops" from the seeds
#link_storage.link_classifier.type: MaxDepthLinkClassifier
#link_storage.link_classifier.max_depth: 1
# Retrain link classifiers on-the-fly
link_storage.online_learning.enabled: false
# Type of online learning (FORWARD_CLASSIFIER_BINARY, FORWARD_CLASSIFIER_LEVELS)
# - FORWARD_CLASSIFIER_BINARY: pos/neg link classifier
# - FORWARD_CLASSIFIER_LEVELS: contextual graph with 3 levels
#link_storage.online_learning.type: FORWARD_CLASSIFIER_BINARY
# Learning iteration criterion (online learning runs after every n pages)
#link_storage.online_learning.learning_limit: 500
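#
# Putting it together, a (commented) sketch that retrains a binary link
# classifier after every 500 crawled pages, using only the keys above:
#link_storage.online_learning.enabled: true
#link_storage.online_learning.type: FORWARD_CLASSIFIER_BINARY
#link_storage.online_learning.learning_limit: 500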
# Types of LinkSelectors available:
# - TopkLinkSelector
# - RandomLinkSelector
# - NonRandomLinkSelector
# - MultiLevelLinkSelector
# - MaximizeWebsitesLinkSelector
link_storage.link_selector: TopkLinkSelector
# Enable recrawling of sitemaps at fixed time intervals (in minutes)
#link_storage.recrawl_selector: SitemapsRecrawlSelector
#link_storage.recrawl_selector.sitemaps.interval: 1
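#
# For example, a (commented) sketch that recrawls sitemaps every 60 minutes;
# it assumes sitemap discovery is enabled via link_storage.download_sitemap_xml
# (see the end of this file):
#link_storage.download_sitemap_xml: true
#link_storage.recrawl_selector: SitemapsRecrawlSelector
#link_storage.recrawl_selector.sitemaps.interval: 60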
# Maximum number of URLs to keep in the in-memory URL cache
link_storage.max_size_cache_urls: 10000
# Directory to store link storage's frontier database
link_storage.directory: "data_url/dir"
# Backlink surfer parameters
#link_storage.backsurfer.moz.access_id: mozscape-xxxxxxxxxx
#link_storage.backsurfer.moz.secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Minimum interval (in milliseconds) between consecutive requests to the same host
link_storage.scheduler.host_min_access_interval: 5000
# Maximum number of links the scheduler keeps in memory at a time
link_storage.scheduler.max_links: 10000
#
# Configurations for Crawler Manager
#
crawler_manager.downloader.user_agent.name: ACHE
crawler_manager.downloader.user_agent.url: https://github.com/ViDA-NYU/ache
#crawler_manager.downloader.user_agent.email: someone@example.com
#crawler_manager.downloader.user_agent.string: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
crawler_manager.downloader.download_thread_pool_size: 100
crawler_manager.downloader.max_retry_count: 2
crawler_manager.downloader.valid_mime_types:
- text/xml
- text/html
- text/plain
- application/x-asp
- application/xhtml+xml
- application/vnd.wap.xhtml+xml
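# For illustration, you could accept additional media types by appending
# entries to the list above, e.g. (commented, and only useful if the rest of
# your pipeline can handle binary content):
#  - application/pdf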
# Use OkHttpFetcher instead of SimpleHttpFetcher
crawler_manager.downloader.use_okhttp3_fetcher: true
# okhttp3 proxy configuration
crawler_manager.downloader.okhttp3.proxy_host: null
crawler_manager.downloader.okhttp3.proxy_username: null
crawler_manager.downloader.okhttp3.proxy_password: null
crawler_manager.downloader.okhttp3.proxy_port: 8080
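#
# For example, to route requests through an authenticated HTTP proxy
# (host, port, and credentials below are hypothetical placeholders):
#crawler_manager.downloader.okhttp3.proxy_host: proxy.example.com
#crawler_manager.downloader.okhttp3.proxy_port: 3128
#crawler_manager.downloader.okhttp3.proxy_username: ache
#crawler_manager.downloader.okhttp3.proxy_password: secret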
# Discover new links using the sitemap.xml protocol
link_storage.download_sitemap_xml: false