-
Notifications
You must be signed in to change notification settings - Fork 0
/
run
executable file
·52 lines (41 loc) · 1.76 KB
/
run
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/bin/bash
# -*- coding: utf-8 -*-
# vim: ts=4
###
#
# Automates the download and transliteration of europarl-v7 corpus
# from statmt.org site
#
# Useful as a reminder script of how the process was done
#
# Copyright (c) 2020 Andalugeeks
# Authors:
# - J. Félix Ontañón <felixonta@gmail.com>
# Pipeline: create a Python virtualenv, download the europarl-v7 es-en corpus,
# split the Spanish side into chunks and transliterate every chunk with the
# `andaluh` command in four variants (.and, .andz, .ands, .andh).
#
# Must be started from the directory that holds requirements.txt; all corpus
# files are written to ./europarl.

# Abort on the first failing step: each stage consumes the output of the
# previous one, and the final cleanup must not delete the archive or the
# extracted corpus after a failed download or transliteration.
set -e

# Prefix of the chunk files produced by `split` (suffixes aa, ab, ...).
corpus=europarl-v7.es-en.es

#######################################
# Runs andaluh over every corpus chunk.
# Globals:   chunks (read)
# Arguments: $1  - extension appended to each output file name
#            $2+ - extra options placed before -i on the andaluh command line
# Outputs:   writes <chunk>.<extension> next to each chunk
#######################################
transliterate() {
    local extension=$1
    shift
    local chunk
    for chunk in "${chunks[@]}"; do
        andaluh "$@" -i "$chunk" > "${chunk}.${extension}"
    done
}

echo "[*] Creating virtenv"
python3 -m venv .env
# shellcheck disable=SC1091
source .env/bin/activate
pip3 install -r requirements.txt

echo "[*] Downloading europarl-v7"
mkdir -p europarl
# Explicit guard: everything below (including the rm calls) assumes we are
# inside ./europarl.
cd europarl || exit 1
# -c resumes a partially downloaded archive instead of starting over.
wget -c http://www.statmt.org/europarl/v7/es-en.tgz

echo "[*] Uncompressing europarl-v7"
tar zxvf es-en.tgz

echo "[*] Replacing some 'ı' char with 'i' because andaluh script does not support that"
sed -i "s/ı/i/g" "$corpus"

echo "[*] Splitting into 90M chunks (github max file size is 100M)"
# Remove chunks left over from an earlier run so the glob below only sees
# what this run's split produced.
rm -f -- "$corpus".[a-z][a-z]
split -b 90M "$corpus" "$corpus".
# Collect whatever split actually wrote instead of assuming exactly aa..ad;
# the two-letter pattern does not match the .and* outputs of earlier runs.
chunks=("$corpus".[a-z][a-z])
if [[ ! -e "${chunks[0]}" ]]; then
    echo "[!] split produced no chunks" >&2
    exit 1
fi

echo "[1/4] Transliterating to standard (this can take some minutes)"
transliterate and

echo "[2/4] Transliterating to zezeo (this can take some minutes)"
transliterate andz -e z

echo "[3/4] Transliterating to seseo (this can take some minutes)"
transliterate ands -e s

echo "[4/4] Transliterating to heheo (this can take some minutes)"
transliterate andh -e h

echo "[*] Finish! Exiting ..."
# Only the chunks and their transliterations are kept; the archive and the
# unsplit corpus files are removed.
rm es-en.tgz
rm europarl-v7.es-en.en "$corpus"
deactivate
cd ..
exit