##################### ACHE Configuration Example #####################
#
# Configurations for Target Storage
#
# Set to false if you don't want to store pages classified as irrelevant
target_storage.store_negative_pages: false
# Configuration for the data format used to store crawled data
# Enable one of the following lines to use one of the FILESYSTEM_* data formats
#target_storage.data_format.type: FILESYSTEM_HTML
#target_storage.data_format.type: FILESYSTEM_CBOR
#target_storage.data_format.type: FILESYSTEM_JSON
#
# Enable this to name files using a fixed-length hash instead of the percent-encoded URL
#target_storage.data_format.filesystem.hash_file_name: true
#
# Enable this to compress the file content
#target_storage.data_format.filesystem.compress_data: true
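#
# For example, a (commented) sketch combining only the keys above, storing
# raw HTML in compressed files named by URL hash:
#target_storage.data_format.type: FILESYSTEM_HTML
#target_storage.data_format.filesystem.hash_file_name: true
#target_storage.data_format.filesystem.compress_data: true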
# Enable this to use the FILES data format
target_storage.data_format.type: FILES
target_storage.data_format.files.max_file_size: 134217728 # 128 MB in bytes
# Enable the following lines to use the WARC data format and change its default settings
#target_storage.data_format.type: WARC # enable WARC file format
#target_storage.data_format.warc.compress: true # enable GZIP compression
#target_storage.data_format.warc.max_file_size: 262144000 # maximum file size in bytes
# Enable the following line to index pages in Elasticsearch
#target_storage.data_format.type: ELASTICSEARCH
#
# (Transport client ES 1.5)
#target_storage.data_format.elasticsearch.host: localhost
#target_storage.data_format.elasticsearch.port: 9300
#target_storage.data_format.elasticsearch.cluster_name: elasticsearch
#
# (REST client ES 1.x and 5.x)
#target_storage.data_format.elasticsearch.rest.hosts:
# - http://localhost:9200
#target_storage.data_format.elasticsearch.rest.connect_timeout: 30000
#target_storage.data_format.elasticsearch.rest.socket_timeout: 30000
#target_storage.data_format.elasticsearch.rest.max_retry_timeout_millis: 90000
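#
# For illustration only, the REST configuration could list several nodes
# (the hostnames below are hypothetical placeholders):
#target_storage.data_format.elasticsearch.rest.hosts:
#  - http://es-node-1:9200
#  - http://es-node-2:9200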
# Instead of configuring a single data format, you can also configure multiple
# data formats in a list as follows. The settings for each data format should
# be configured independently, as if you were configuring a single data format.
# In the following config, data will be pushed to both FILES and ELASTICSEARCH.
#
#target_storage.data_formats:
# - FILES
# - ELASTICSEARCH
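#
# For example, the following (commented) sketch stores pages in FILES on disk
# and also indexes them in Elasticsearch, reusing only keys shown above:
#target_storage.data_formats:
#  - FILES
#  - ELASTICSEARCH
#target_storage.data_format.files.max_file_size: 134217728
#target_storage.data_format.elasticsearch.rest.hosts:
#  - http://localhost:9200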
# Whether to perform hard focus or soft focus. When hard focus is enabled,
# the crawler only follows links from pages classified as relevant
target_storage.hard_focus: true
# Run bipartite crawler
target_storage.bipartite: false
# Maximum number of pages to visit
target_storage.visited_page_limit: 10000000
# Store only pages that contain English text, using a language detector
target_storage.english_language_detection_enabled: true
#
# Configurations for Link Storage
#
# Max number of pages to be crawled from each web domain
link_storage.max_pages_per_domain: 100
# Restricts the crawler to the websites provided as seeds
link_storage.link_strategy.use_scope: false
# Allows the crawler to follow forward links
link_storage.link_strategy.outlinks: true
# Gets backlinks of the pages from a search engine; used by bipartite crawling
link_storage.link_strategy.backlinks: false
# Type of link classifier used by link storage
# - LinkClassifierBaseline: random link strategy when no page classifier is provided, or Soumen's baseline strategy when a page classifier is provided
# - LinkClassifierImpl: link strategy using a link classifier
# - LinkClassifierAuthority: link strategy for the bipartite crawling
link_storage.link_classifier.type: LinkClassifierBaseline
#link_storage.link_classifier.type: LinkClassifierImpl
#link_storage.link_classifier.parameters.class_values: ["0", "1", "2"]
# Restricts the crawler to links within a given number of "hops" from the seeds
#link_storage.link_classifier.type: MaxDepthLinkClassifier
#link_storage.link_classifier.max_depth: 1
# Retrain link classifiers on-the-fly
link_storage.online_learning.enabled: false
# Type of online learning (FORWARD_CLASSIFIER_BINARY, FORWARD_CLASSIFIER_LEVELS)
# - FORWARD_CLASSIFIER_BINARY: pos/neg link classifier
# - FORWARD_CLASSIFIER_LEVELS: contextual graph with 3 levels
#link_storage.online_learning.type: FORWARD_CLASSIFIER_BINARY
# Learning iteration criterion (online learning runs after every n pages)
#link_storage.online_learning.learning_limit: 500
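#
# Putting it together, a (commented) sketch that retrains a binary link
# classifier after every 500 crawled pages, using only the keys above:
#link_storage.online_learning.enabled: true
#link_storage.online_learning.type: FORWARD_CLASSIFIER_BINARY
#link_storage.online_learning.learning_limit: 500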
# Types of LinkSelectors available:
# - TopkLinkSelector
# - RandomLinkSelector
# - NonRandomLinkSelector
# - MultiLevelLinkSelector
# - MaximizeWebsitesLinkSelector
link_storage.link_selector: TopkLinkSelector
# Enable recrawling of sitemaps at fixed time intervals (in minutes)
#link_storage.recrawl_selector: SitemapsRecrawlSelector
#link_storage.recrawl_selector.sitemaps.interval: 1
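#
# For example, a (commented) sketch that recrawls sitemaps every 60 minutes;
# it assumes sitemap discovery is enabled via link_storage.download_sitemap_xml
# (see the end of this file):
#link_storage.download_sitemap_xml: true
#link_storage.recrawl_selector: SitemapsRecrawlSelector
#link_storage.recrawl_selector.sitemaps.interval: 60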
# Maximum number of URLs to keep in the in-memory URL cache
link_storage.max_size_cache_urls: 10000
# Directory to store link storage's frontier database
link_storage.directory: "data_url/dir"
# Backlink surfer parameters
#link_storage.backsurfer.moz.access_id: mozscape-xxxxxxxxxx
#link_storage.backsurfer.moz.secret_key: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
# Minimum interval (in milliseconds) between consecutive requests to the same host
link_storage.scheduler.host_min_access_interval: 5000
# Maximum number of links the scheduler keeps in memory at a time
link_storage.scheduler.max_links: 10000
#
# Configurations for Crawler Manager
#
crawler_manager.downloader.user_agent.name: ACHE
crawler_manager.downloader.user_agent.url: https://github.com/ViDA-NYU/ache
#crawler_manager.downloader.user_agent.email: someone@example.com
#crawler_manager.downloader.user_agent.string: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
crawler_manager.downloader.download_thread_pool_size: 100
crawler_manager.downloader.max_retry_count: 2
crawler_manager.downloader.valid_mime_types:
- text/xml
- text/html
- text/plain
- application/x-asp
- application/xhtml+xml
- application/vnd.wap.xhtml+xml
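# For illustration, you could accept additional media types by appending
# entries to the list above, e.g. (commented, and only useful if the rest of
# your pipeline can handle binary content):
#  - application/pdf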
# Use OkHttpFetcher instead of SimpleHttpFetcher
crawler_manager.downloader.use_okhttp3_fetcher: true
# okhttp3 proxy configuration
crawler_manager.downloader.okhttp3.proxy_host: null
crawler_manager.downloader.okhttp3.proxy_username: null
crawler_manager.downloader.okhttp3.proxy_password: null
crawler_manager.downloader.okhttp3.proxy_port: 8080
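#
# For example, to route requests through an authenticated HTTP proxy
# (host, port, and credentials below are hypothetical placeholders):
#crawler_manager.downloader.okhttp3.proxy_host: proxy.example.com
#crawler_manager.downloader.okhttp3.proxy_port: 3128
#crawler_manager.downloader.okhttp3.proxy_username: ache
#crawler_manager.downloader.okhttp3.proxy_password: secret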
# Discover new links using the sitemap.xml protocol
link_storage.download_sitemap_xml: false