forked from globalbiodata/inventory_2022
-
Notifications
You must be signed in to change notification settings - Fork 0
/
update_inventory.yml
68 lines (48 loc) · 1.44 KB
/
update_inventory.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# Environments
# Relative path './env'; presumably the directory holding the project's
# software environment, resolved against the working directory the consumer
# is launched from -- TODO confirm.
project_env: './env'
# Directories
# All values are relative paths under 'out/'. Every entry except
# last_date_dir and the two *_train_outdir keys sits at or below
# 'out/new_query'. That prefix is repeated literally in each value (there is no
# interpolation here), so changing query_out_dir does not move the others.
# NOTE(review): key naming is inconsistent ('_outdir' vs '_out_dir' vs '_dir',
# and two keys with no suffix); the names are kept as-is because the consumer
# looks them up by name.
## Querying
query_out_dir: 'out/new_query'
last_date_dir: 'out/last_query_date'
## Classification
# Training output lives outside 'out/new_query'; prediction output inside it.
classif_train_outdir: 'out/classif_train_out'
classif_out_dir: 'out/new_query/classification'
## NER
# Same split as classification: training output outside, new-query output inside.
ner_train_outdir: 'out/ner_train_out'
ner_out_dir: 'out/new_query/ner'
## URL Extraction
extract_url_dir: 'out/new_query/url_extraction'
## Name processing
processed_names_dir: 'out/new_query/processed_names'
## Initial deduplication
initial_dedupe_dir: 'out/new_query/initial_deduplication'
## For manual review
for_manual_review_dir: 'out/new_query/for_manual_review'
## Manually reviewed
manually_reviewed_dir: 'out/new_query/manually_reviewed'
## Processed manual review
# Key has no '_dir' suffix, but the value is a directory-style path like the rest.
processed_manual_review: 'out/new_query/processed_manual_review'
## URL Checking
check_url_dir: 'out/new_query/url_checking'
## Additional metadata from EuropePMC
epmc_meta_dir: 'out/new_query/epmc_meta'
## Processed country codes
# Key has no '_dir' suffix, but the value is a directory-style path like the rest.
processed_countries: 'out/new_query/processed_countries'
# Parameters
## Europe PMC query
# Both bounds are unquoted, so YAML loads them as the integer 2022, not a
# string. NOTE(review): presumably publication years; whether the range is
# inclusive is decided by the consumer -- confirm.
query_from_date: 2022
query_to_date: 2022
# The value is a path, not the query text; presumably the query is read from
# this file -- confirm against the consumer.
query_string: 'config/query.txt'
## Previous inventory to be merged
previous_inventory: 'data/final_inventory_2022.csv'
## Filtering and marking for manual review
# max_urls is an integer, min_best_name_prob a float.
# NOTE(review): how each threshold is compared (strict or inclusive) is not
# visible here -- confirm.
max_urls: 2
min_best_name_prob: 0.978
## URL checking
# NOTE(review): the unit of 'backoff' (seconds vs. multiplier) and what a
# "chunk" contains are not stated in this file -- confirm.
chunk_size: 200
num_tries: 3
backoff: 0.5
## Getting metadata from EuropePMC
epmc_chunk_size: 20
## Processing country names
# NOTE(review): 'full' presumably selects full country names rather than
# codes; the set of accepted values is defined by the consumer -- confirm.
country_format: 'full'