Skip to content

Commit

Permalink
Fix shebang lines in scripts/oldump.py and scripts/sitemaps/sitemap.py (#6163)
Browse files Browse the repository at this point in the history

* Fix shebang line of scripts/oldump.py
* oldump.sh: Leverage the shebang lines in oldump.py
* Fix shebang lines of scripts oldump.py and sitemap.py
* oldump.py: Print Python version
* oldump.sh: Use pyenv's Python 3.9.4
  • Loading branch information
cclauss authored Feb 22, 2022
1 parent 7160f0f commit 9505239
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 7 deletions.
11 changes: 8 additions & 3 deletions scripts/oldump.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
#! /usr/bin/env python
#! /usr/bin/env python3

import sys

import _init_path
from openlibrary.data import dump


if __name__ == "__main__":
import sys
print("{}: Python {}.{}.{}".format(__file__, *sys.version_info))

from openlibrary.data import dump

dump.main(sys.argv[1], sys.argv[2:])
10 changes: 6 additions & 4 deletions scripts/oldump.sh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ if [ $# -lt 1 ]; then
exit 1
fi

PATH="/home/openlibrary/.pyenv/plugins/pyenv-virtualenv/shims:/home/openlibrary/.pyenv/bin:$PATH"
PYTHON=$(pyenv which python3.9)
SCRIPTS=/openlibrary/scripts
PSQL_PARAMS=${PSQL_PARAMS:-"-h db openlibrary"}
TMPDIR=${TMPDIR:-/openlibrary/dumps}
Expand Down Expand Up @@ -83,7 +85,7 @@ ls -lhR # data.txt.gz is 29G
# generate cdump, sort and generate dump
log "generating $cdump.txt.gz -- takes approx. 500 minutes for 192,000,000+ records..."
# if $OLDUMP_TESTING has been exported then `oldump.py cdump` will only process a subset.
time $SCRIPTS/oldump.py cdump data.txt.gz $date | gzip -c > $cdump.txt.gz
time $PYTHON $SCRIPTS/oldump.py cdump data.txt.gz $date | gzip -c > $cdump.txt.gz
log "generated $cdump.txt.gz"
ls -lhR # ol_cdump_2021-11-14.txt.gz is 25G

Expand All @@ -96,15 +98,15 @@ if [[ -z $OLDUMP_TESTING ]]; then
fi

echo "generating the dump -- takes approx. 485 minutes for 173,000,000+ records..."
time gzip -cd $cdump.txt.gz | python $SCRIPTS/oldump.py sort --tmpdir $TMPDIR | python $SCRIPTS/oldump.py dump | gzip -c > $dump.txt.gz
time gzip -cd $cdump.txt.gz | $PYTHON $SCRIPTS/oldump.py sort --tmpdir $TMPDIR | $PYTHON $SCRIPTS/oldump.py dump | gzip -c > $dump.txt.gz
echo "generated $dump.txt.gz"
ls -lhR

# Remove the temp sort dir after dump generation
rm -rf $TMPDIR/oldumpsort

echo "splitting the dump: ol_dump_%s_$date.txt.gz -- takes approx. 85 minutes for 68,000,000+ records..."
time gzip -cd $dump.txt.gz | python $SCRIPTS/oldump.py split --format ol_dump_%s_$date.txt.gz
time gzip -cd $dump.txt.gz | $PYTHON $SCRIPTS/oldump.py split --format ol_dump_%s_$date.txt.gz
echo "done"

mkdir -p $dump $cdump
Expand Down Expand Up @@ -138,7 +140,7 @@ log "generating sitemaps"
rm -fr $TMPDIR/sitemaps
mkdir -p $TMPDIR/sitemaps
cd $TMPDIR/sitemaps
time python $SCRIPTS/sitemaps/sitemap.py $TMPDIR/dumps/$dump/$dump.txt.gz > sitemaps.log
time $PYTHON $SCRIPTS/sitemaps/sitemap.py $TMPDIR/dumps/$dump/$dump.txt.gz > sitemaps.log
ls -lh

MSG="$USER has completed $0 $1 $2 in $TMPDIR on ${HOSTNAME:-$HOST} at $(date)"
Expand Down
1 change: 1 addition & 0 deletions scripts/sitemaps/sitemap.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#! /usr/bin/env python3
"""Script to generate XML sitemap of openlibrary.org website.
USAGE:
Expand Down

0 comments on commit 9505239

Please sign in to comment.