default:
  image: condaforge/linux-anvil-cos7-x86_64:latest

stages:
  - start
  - stop
  - build
  - test
  - deploy

# === Variables ===

variables:
  PACKAGE_VERSION: 0.2.1

# === Configurations ===

.skip-custom-pipelines:
  except:
    variables:
      - $UPDATE_TABLES
      - $CREATE_BIGQUERY
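
# Jobs that extend .skip-custom-pipelines are excluded from the custom
# pipelines triggered with $UPDATE_TABLES or $CREATE_BIGQUERY (see the
# `only:` blocks of generate-parquet-files, create-bigquery, and
# destroy-bigquery at the bottom of this file), so the regular
# build/test/deploy jobs never run alongside the data pipelines.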

.configure:
  extends:
    - .skip-custom-pipelines
  before_script:
    # Rust
    - curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
    - source $HOME/.cargo/env
    # Conda
    - |
      cat <<EOF > ~/.condarc
      channel_priority: strict
      channels:
        - conda-forge
        - ostrokach-forge
        - defaults
      EOF
    - source /opt/conda/etc/profile.d/conda.sh
    - conda activate base
    - conda update -yq conda
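
# `extends` merges templates, so every job based on .configure inherits both
# the skip rules above and this before_script, giving it a Rust toolchain and
# a conda base environment wired to the channels listed in ~/.condarc.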

# === Build ===

build:
  stage: build
  extends:
    - .configure
  script:
    - mkdir -p "${CI_PROJECT_DIR}/conda-bld"
    - conda build "${CI_PROJECT_DIR}/.conda" --output-folder "${CI_PROJECT_DIR}/conda-bld"
  artifacts:
    paths:
      - conda-bld
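
# The conda-bld/ folder produced here is passed downstream as an artifact;
# test, .pages, and deploy-conda all pull it in via `dependencies: [build]`
# and treat it as a local conda channel (file://${CI_PROJECT_DIR}/conda-bld).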

# === Test ===

test:
  stage: test
  extends:
    - .configure
  dependencies:
    - build
  script:
    # Create conda environment for testing
    - conda create -n test -q -c file://${CI_PROJECT_DIR}/conda-bld "python=3.9" ${CI_PROJECT_NAME}
    - conda activate test
    # Run tests
    - uniparc_xml_parser --help
    # - python -m pytest -c setup.cfg --color=yes "tests/"
    # Save binary for later
    - mkdir package/
    - cp $(which uniparc_xml_parser) package/
  artifacts:
    paths:
      - package/

# download:
#   stage: download
#   script:
#     - 'wget --header="JOB-TOKEN: $CI_JOB_TOKEN" ${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/generic/my_package/0.0.1/file.txt'

# === Pages ===

.pages:
  stage: test
  extends:
    - .configure
  dependencies:
    - build
  script:
    # Install requirements
    - conda create -n test -q -c file://${CI_PROJECT_DIR}/conda-bld "python=3.9"
    - conda activate test
    - python -m pip install -r docs/requirements.txt
    # Build docs (`mkdocs build` takes no positional directory argument;
    # this assumes the MkDocs config lives at docs/mkdocs.yml)
    - mkdocs build -f docs/mkdocs.yml
  artifacts:
    paths:
      - public

# === Deploy ===

deploy-cargo:
  stage: deploy
  extends:
    - .configure
  script:
    - cargo publish --no-verify
  dependencies: []
  only:
    - tags

deploy-conda:
  stage: deploy
  extends:
    - .configure
  script:
    - anaconda -t $ANACONDA_TOKEN upload $CI_PROJECT_DIR/conda-bld/*/*.tar.bz2 -u ostrokach-forge --no-progress
  dependencies:
    - build
  only:
    - tags

deploy-package:
  stage: deploy
  extends:
    - .configure
  script:
    - >
      curl --header "JOB-TOKEN: $CI_JOB_TOKEN" --upload-file $CI_PROJECT_DIR/package/uniparc_xml_parser
      "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/generic/${CI_PROJECT_NAME}/${PACKAGE_VERSION}/uniparc_xml_parser"
  dependencies:
    - test
  only:
    - tags
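
# The binary uploaded above can be fetched back with a GET against the same
# generic-package URL, as the commented-out `download` job sketches; e.g.
# (a hypothetical usage example, not part of the pipeline):
# curl --header "JOB-TOKEN: $CI_JOB_TOKEN" -o uniparc_xml_parser \
#   "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/generic/${CI_PROJECT_NAME}/${PACKAGE_VERSION}/uniparc_xml_parser"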

# === Run pipeline ===

.install-ssh-client:
  script: &install-ssh-client
    - "which ssh-agent || ( apt-get install -y -qq -o=Dpkg::Use-Pty=0 openssh-client )"
    - eval $(ssh-agent -s)
    - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add -
    - mkdir -p ~/.ssh
    - chmod 700 ~/.ssh
    - echo "$KNOWN_HOSTS" >> ~/.ssh/known_hosts
    # Test that the ssh client works
    # - ssh strokach@conda-envs.proteinsolver.org "echo hello"
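
# The `&install-ssh-client`, `&install-gcloud`, and `&install-conda` YAML
# anchors defined in these hidden jobs are spliced into the before_script of
# the data-pipeline jobs below via `- *install-ssh-client` etc., so each
# install sequence is written once and reused.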

.install-gcloud:
  script: &install-gcloud
    - >
      echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main"
      | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
    - apt-get install -y -qq -o=Dpkg::Use-Pty=0 apt-transport-https ca-certificates gnupg
    - curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
    - apt-get update -y -qq -o=Dpkg::Use-Pty=0
    # TODO: Remove version pin when this issue is fixed: https://github.com/googleapis/google-api-python-client/issues/1006
    - apt-get install -y -qq -o=Dpkg::Use-Pty=0 "google-cloud-sdk=343.*"
    - gcloud auth activate-service-account --key-file="${GCLOUD_SERVICE_ACCOUNT_FILE}"
    - gcloud --quiet config set project ostrokach-data
    - gcloud --quiet config set compute/zone us-central1-b

.install-conda:
  script: &install-conda
    - |
      cat <<EOF > ~/.condarc
      channel_priority: strict
      channels:
        - conda-forge
        - ostrokach-forge
        - defaults
      EOF
    - curl -s -L https://github.com/conda-forge/miniforge/releases/download/4.9.2-5/Mambaforge-4.9.2-5-Linux-x86_64.sh > miniconda.sh
    - openssl dgst -sha256 miniconda.sh | grep 7f0ad0c2f367751f7878d25a7bc1b4aa48b8dcea864daf9bc09acb595102368b
    - sh miniconda.sh -b -p /opt/conda
    - source /opt/conda/etc/profile.d/conda.sh
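
# Unlike jobs extending .configure (which start from a conda-forge image),
# the data-pipeline jobs below run on plain ubuntu:20.04, so this anchor
# bootstraps Mambaforge into /opt/conda, checking the installer's sha256
# digest (via `openssl dgst | grep`, which fails the job on a mismatch)
# before running it.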

generate-parquet-files:
  image: ubuntu:20.04
  tags:
    - 3tb
  variables:
    KUBERNETES_CPU_REQUEST: "0.7"
    KUBERNETES_CPU_LIMIT: "4.0"
    KUBERNETES_MEMORY_REQUEST: 0.9G
    KUBERNETES_MEMORY_LIMIT: 4.0G
    KUBERNETES_EPHEMERAL_STORAGE_REQUEST: 2T
    KUBERNETES_EPHEMERAL_STORAGE_LIMIT: 3T
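  # The KUBERNETES_* variables above override the resource requests/limits
  # that the GitLab Kubernetes executor sets on the job pod; the 2-3 TB of
  # ephemeral storage leaves room for the downloaded uniparc_all.xml.gz and
  # the TSV/Parquet files generated from it.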
  before_script:
    # Install global dependencies
    - apt-get update -y -qq -o=Dpkg::Use-Pty=0
    - apt-get install -y -qq -o=Dpkg::Use-Pty=0 curl gettext-base gzip openssl rsync
    - *install-ssh-client
    - *install-conda
  script:
    - conda activate base
    - mamba install -y 'python=3.9' pyarrow uniparc_xml_parser
    # Download input data (the background loop prints the growing file size
    # every two minutes so the multi-hour download keeps producing output)
    - (while true ; do sleep 120 ; ls -lSh uniparc_all.xml.gz ; done) &
    - monitor_pid=$!
    - curl -O -C - --retry 999 --retry-max-time 0 --no-progress-meter ftp://ftp.uniprot.org/pub/databases/uniprot/current_release/uniparc/uniparc_all.xml.gz
    - kill $monitor_pid
    # Process data
    - mkdir uniparc && cd uniparc
    - zcat ../uniparc_all.xml.gz | uniparc_xml_parser
    # Convert to Parquet files
    - if [[ -d /share/data/uniparc ]] ;
      then
        OUTPUT_DIR=/share/data/uniparc ;
      else
        OUTPUT_DIR=$(pwd) ;
      fi
    - python ../scripts/csv_to_parquet.py -q -f uniparc.tsv -o "${OUTPUT_DIR}/uniparc.parquet" -c uniparc_id,sequence,sequence_length,sequence_checksum
    - python ../scripts/csv_to_parquet.py -q -f domain.tsv -o "${OUTPUT_DIR}/domain.parquet" -c uniparc_id,database,database_id,interpro_name,interpro_id,domain_start,domain_end
    - python ../scripts/csv_to_parquet.py -q -f xref.tsv -o "${OUTPUT_DIR}/xref.parquet" -c uniparc_id,xref_id,db_type,db_id,version_i,active,version,created,last
    - for property in component gene_name ncbi_gi ncbi_taxonomy_id pdb_chain protein_name proteome_id uniprot_kb_accession ; do
        echo ${property} ;
        python ../scripts/csv_to_parquet.py -q -f ${property}.tsv -o "${OUTPUT_DIR}/${property}.parquet" -c uniparc_id,xref_id,property,value ;
      done
    # Upload Parquet files to our server
    - if [[ ! -d /share/data/uniparc ]] ;
      then
        rsync -rpv --chmod=ug=rwX,o=rX *.parquet strokach@conda-envs.proteinsolver.org:/share/data/uniparc/ ;
      fi
  timeout: 3 days
  only:
    variables:
      - $UPDATE_TABLES

create-bigquery:
  image: ubuntu:20.04
  stage: start
  tags:
    - 3tb
  before_script:
    # Install global dependencies
    - apt-get update -y -qq -o=Dpkg::Use-Pty=0
    - apt-get install -y -qq -o=Dpkg::Use-Pty=0 curl gettext-base gzip openssl rsync
    - *install-ssh-client
    - *install-gcloud
  script:
    - if [[ -d /share/data/uniparc ]] ;
      then
        cd /share/data/uniparc ;
      else
        mkdir output ;
        rsync -rpv strokach@conda-envs.proteinsolver.org:/share/data/uniparc/ output/ ;
        cd output ;
      fi
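    # Load each Parquet file into a table in the `uniparc` BigQuery dataset,
    # clustered on the columns queries are most likely to filter by
    # (clustering reduces the amount of data BigQuery scans and bills for)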
    - bq load --project_id=ostrokach-data --source_format=PARQUET --replace --clustering_fields uniparc_id ostrokach-data:uniparc.uniparc uniparc.parquet
    - bq load --project_id=ostrokach-data --source_format=PARQUET --replace --clustering_fields database,uniparc_id ostrokach-data:uniparc.domain domain.parquet
    - bq load --project_id=ostrokach-data --source_format=PARQUET --replace --clustering_fields db_type,uniparc_id,xref_id ostrokach-data:uniparc.xref xref.parquet
    - for property in component gene_name ncbi_gi ncbi_taxonomy_id pdb_chain protein_name proteome_id uniprot_kb_accession ; do
        echo ${property} ;
        bq load --project_id=ostrokach-data --source_format=PARQUET --replace --clustering_fields uniparc_id,xref_id ostrokach-data:uniparc.${property} ${property}.parquet ;
      done
  timeout: 6 hours
  only:
    variables:
      - $CREATE_BIGQUERY
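
# Delayed cleanup: `when: delayed` with `start_in: 2 days` makes this job
# start automatically two days after the pipeline is created, removing the
# BigQuery tables that create-bigquery loaded.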
destroy-bigquery:
  image: ubuntu:20.04
  stage: stop
  before_script:
    # Install global dependencies
    - apt-get update -y -qq -o=Dpkg::Use-Pty=0
    - apt-get install -y -qq -o=Dpkg::Use-Pty=0 curl gettext-base gzip openssl rsync
    - *install-gcloud
  script:
    # Delete BigQuery tables
    - bq rm -f -t ostrokach-data:uniparc.uniparc
    - bq rm -f -t ostrokach-data:uniparc.domain
    - bq rm -f -t ostrokach-data:uniparc.xref
    - for property in component gene_name ncbi_gi ncbi_taxonomy_id pdb_chain protein_name proteome_id uniprot_kb_accession ; do
        echo ${property} ;
        bq rm -f -t ostrokach-data:uniparc.${property} ;
      done
  dependencies:
    - create-bigquery
  when: delayed
  start_in: 2 days
  only:
    variables:
      - $CREATE_BIGQUERY