-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrrun-f-oger-batch.sh
executable file
·83 lines (70 loc) · 4.65 KB
/
rrun-f-oger-batch.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/local/bin/bash
# Per-run settings for launching ./scripts/pipelines/concepts/run_oger.sh.
# The shared environment file is loaded first, so any assignment made below
# takes effect after (and on top of) whatever that file sets.
. ./rrun.env.sh

# Pipeline script that the launch commands further down invoke.
SCRIPT='./scripts/pipelines/concepts/run_oger.sh'

# ---------------------------------------------------------------------------
# Input text source: MEDLINE or PMCOA. Exactly one group should be active.
#
# MEDLINE (disabled) -- uncomment these four lines and comment out the PMCOA
# group below to switch:
# TEXT_PIPELINE_KEY=MEDLINE_XML_TO_TEXT
# TEXT_PIPELINE_VERSION="recent"
# SUBSET_PREFIX=PUBMED_SUB_
# MAX_SUBSET_INDEX=36
#
# PMCOA (active):
TEXT_PIPELINE_KEY='FILTER_UNACTIONABLE_TEXT'
TEXT_PIPELINE_VERSION='recent'
# SUBSET_PREFIX and MAX_SUBSET_INDEX are only read by the bulk-processing loop
# (commented out further down), which builds collection names as
# "${SUBSET_PREFIX}<index>" up to MAX_SUBSET_INDEX.
SUBSET_PREFIX='PMC_SUBSET_'
MAX_SUBSET_INDEX=41
# ---------------------------------------------------------------------------

# Augmented document text input.
AUGMENTED_TEXT_PIPELINE_KEY='DOC_TEXT_AUGMENTATION'
AUGMENTED_TEXT_PIPELINE_VERSION='recent'

# Output version history:
#   0.3.0 - run on augmented doc text
OUTPUT_PIPELINE_VERSION='0.3.0'
OVERWRITE='YES'

# Default is the literal string "null"; the redo-run section later in this
# file reassigns it. NOTE(review): presumably run_oger.sh treats "null" as
# "no document-specific collection" -- confirm against that script.
OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION='null'
# # # use the below to run a single collection
# COLLECTION="PUBMED_SUB_0"
# $SCRIPT $CS_OGER_SERVICE_URL_PROD $CIMIN_OGER_SERVICE_URL_PROD $CIMAX_OGER_SERVICE_URL_PROD $PROJECT_ID ${COLLECTION} ${STAGE_LOCATION} ${TEMP_LOCATION} $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $AUGMENTED_TEXT_PIPELINE_KEY $AUGMENTED_TEXT_PIPELINE_VERSION $OUTPUT_PIPELINE_VERSION $OVERWRITE $OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION $JAR_VERSION &> "./logs/oger-${COLLECTION}.log" &
# ---------------------------------------------
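# Use the loop below for bulk processing. Starting at index 1, it launches the
# "${SUBSET_PREFIX}<index>" collections four at a time (with a 120-second
# pause between launches within a group) and waits for each group of four to
# finish before starting the next; indexes above MAX_SUBSET_INDEX are skipped.
#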
# for INDEX in $(seq 1 4 $MAX_SUBSET_INDEX)
# do
# ind=$(($INDEX + 0))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting oger pipeline... ${ind} $(date)"
# COLLECTION="${SUBSET_PREFIX}${ind}"
# $SCRIPT $CS_OGER_SERVICE_URL_PROD $CIMIN_OGER_SERVICE_URL_PROD $CIMAX_OGER_SERVICE_URL_PROD $PROJECT_ID ${COLLECTION} ${STAGE_LOCATION} ${TEMP_LOCATION} $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $AUGMENTED_TEXT_PIPELINE_KEY $AUGMENTED_TEXT_PIPELINE_VERSION $OUTPUT_PIPELINE_VERSION $OVERWRITE $OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION $JAR_VERSION &> "./logs/oger-${COLLECTION}.log" &
# sleep 120
# fi
# ind=$(($INDEX + 1))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting oger pipeline... ${ind} $(date)"
# COLLECTION="${SUBSET_PREFIX}${ind}"
# $SCRIPT $CS_OGER_SERVICE_URL_PROD $CIMIN_OGER_SERVICE_URL_PROD $CIMAX_OGER_SERVICE_URL_PROD $PROJECT_ID ${COLLECTION} ${STAGE_LOCATION} ${TEMP_LOCATION} $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $AUGMENTED_TEXT_PIPELINE_KEY $AUGMENTED_TEXT_PIPELINE_VERSION $OUTPUT_PIPELINE_VERSION $OVERWRITE $OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION $JAR_VERSION &> "./logs/oger-${COLLECTION}.log" &
# sleep 120
# fi
# ind=$(($INDEX + 2))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting oger pipeline... ${ind} $(date)"
# COLLECTION="${SUBSET_PREFIX}${ind}"
# $SCRIPT $CS_OGER_SERVICE_URL_PROD $CIMIN_OGER_SERVICE_URL_PROD $CIMAX_OGER_SERVICE_URL_PROD $PROJECT_ID ${COLLECTION} ${STAGE_LOCATION} ${TEMP_LOCATION} $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $AUGMENTED_TEXT_PIPELINE_KEY $AUGMENTED_TEXT_PIPELINE_VERSION $OUTPUT_PIPELINE_VERSION $OVERWRITE $OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION $JAR_VERSION &> "./logs/oger-${COLLECTION}.log" &
# sleep 120
# fi
# ind=$(($INDEX + 3))
# if (( ind <= $MAX_SUBSET_INDEX)); then
# echo "Starting oger pipeline... ${ind} $(date)"
# COLLECTION="${SUBSET_PREFIX}${ind}"
# $SCRIPT $CS_OGER_SERVICE_URL_PROD $CIMIN_OGER_SERVICE_URL_PROD $CIMAX_OGER_SERVICE_URL_PROD $PROJECT_ID ${COLLECTION} ${STAGE_LOCATION} ${TEMP_LOCATION} $TEXT_PIPELINE_KEY $TEXT_PIPELINE_VERSION $AUGMENTED_TEXT_PIPELINE_KEY $AUGMENTED_TEXT_PIPELINE_VERSION $OUTPUT_PIPELINE_VERSION $OVERWRITE $OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION $JAR_VERSION &> "./logs/oger-${COLLECTION}.log" &
# fi
# wait
# done
# ---------------------------------------------
# Use the below to do a redo-oger run where the status entities have been
# tagged by the collection-assignment pipeline. In this case, the status
# entities belong to a specific collection, e.g., REDO_OGER_20230915; however,
# the documents themselves are not tagged with that collection. In order to
# run the redo, we set OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION to an appropriate
# collection for the documents, e.g., PUBMED or PMCOA. This way, the
# documents are not constrained to "REDO_OGER_20230915", since that would
# result in no documents being returned. The tradeoff is that more documents
# will be initially found than needed.
OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION=PMCOA
COLLECTION="REDO_2_OGER_PMCOA_20231208"

# Names of the settings passed to $SCRIPT, in positional order. The order is
# part of the contract with the script and must not be changed here.
# NOTE(review): the *_OGER_SERVICE_URL_PROD, PROJECT_ID, STAGE_LOCATION,
# TEMP_LOCATION and JAR_VERSION values are not assigned in this file;
# presumably they come from rrun.env.sh -- confirm.
oger_arg_names=(
  CS_OGER_SERVICE_URL_PROD
  CIMIN_OGER_SERVICE_URL_PROD
  CIMAX_OGER_SERVICE_URL_PROD
  PROJECT_ID
  COLLECTION
  STAGE_LOCATION
  TEMP_LOCATION
  TEXT_PIPELINE_KEY
  TEXT_PIPELINE_VERSION
  AUGMENTED_TEXT_PIPELINE_KEY
  AUGMENTED_TEXT_PIPELINE_VERSION
  OUTPUT_PIPELINE_VERSION
  OVERWRITE
  OPTIONAL_DOCUMENT_SPECIFIC_COLLECTION
  JAR_VERSION
)

# Fail before launching rather than inside a background log: a missing or
# non-executable script would otherwise only be reported in the log file.
if [[ -z "${SCRIPT:-}" || ! -x "$SCRIPT" ]]; then
  printf 'ERROR: SCRIPT (%s) is unset or not executable\n' "${SCRIPT:-}" >&2
  exit 1
fi

# Build the argument list with one array element per setting. An empty or
# unset setting is rejected because, passed unquoted, it would vanish from the
# command line and shift every later positional argument by one.
oger_args=()
for arg_name in "${oger_arg_names[@]}"; do
  if [[ -z "${!arg_name:-}" ]]; then
    printf 'ERROR: required setting %s is unset or empty\n' "$arg_name" >&2
    exit 1
  fi
  oger_args+=("${!arg_name}")
done

# The output redirect below fails if the log directory does not exist.
mkdir -p ./logs || exit 1

# Launch in the background; stdout and stderr both go to the per-collection
# log. The script does not wait for the job to finish.
"$SCRIPT" "${oger_args[@]}" &> "./logs/oger-${COLLECTION}.log" &