Skip to content

Commit

Permalink
[WIP] Add Dockerfile for gensim with wrappers (piskvorky#1368)
Browse files Browse the repository at this point in the history
* added dockerfile

* remove fasttext from pip installs

* remove syntax errors

* remove unused imports

* modified dockerfile

* add subversion, locales

* use both python2 and python3

* upgrade numpy version

* add readme with relevant commands

* add fixed versions for wrapper dependencies

* made requested changes

* update readme

* change vw pin and remove docker-yml

* change vw version and make absolute paths for wrappers

* specify original gensim repo for download

* change maintainer

* correct missing slash

* use git clone for gensim

* correct gensim folder sequences
  • Loading branch information
parulsethi authored and saparina committed Jul 9, 2017
1 parent 9f5feb5 commit 3978e5a
Show file tree
Hide file tree
Showing 4 changed files with 183 additions and 0 deletions.
145 changes: 145 additions & 0 deletions docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# Base image: Ubuntu 16.04. The exact apt package versions pinned further down
# are specific to this release.
FROM ubuntu:16.04

# MAINTAINER is deprecated; the same contact is recorded as an image label.
LABEL maintainer="Parul Sethi <parul1sethi@gmail.com>"

# Repository and release tag of gensim that is cloned and installed later in
# this file.
ENV GENSIM_REPOSITORY=https://github.com/RaRe-Technologies/gensim.git \
    GENSIM_VERSION=2.2.0

# Install the build toolchain (compilers, JDK/ant, VCS clients), both Python
# interpreters with pip/setuptools, and the native libraries used by later
# steps. Every package is pinned to an exact version.
# The apt package index is deleted in the same layer so it does not end up in
# the image.
# NOTE(review): --no-install-recommends is deliberately not added; later steps
# invoke `make`, which is not listed here and presumably arrives as a
# recommended package - confirm before tightening.
RUN apt-get update \
    && apt-get install -y \
        ant=1.9.6-1ubuntu1 \
        cmake=3.5.1-1ubuntu3 \
        default-jdk=2:1.8-56ubuntu2 \
        g++=4:5.3.1-1ubuntu1 \
        git=1:2.7.4-0ubuntu1 \
        libboost-all-dev=1.58.0.1ubuntu1 \
        libboost-program-options-dev=1.58.0.1ubuntu1 \
        libgsl-dev=2.1+dfsg-2 \
        libopenblas-dev=0.2.18-1ubuntu1 \
        locales=2.23-0ubuntu9 \
        mercurial=3.7.3-1ubuntu1 \
        python=2.7.11-1 \
        python-pip=8.1.1-2ubuntu0.4 \
        python-setuptools=20.7.0-1 \
        python3=3.5.1-3 \
        python3-pip=8.1.1-2ubuntu0.4 \
        python3-setuptools=20.7.0-1 \
        subversion=1.9.3-2ubuntu1 \
        unzip=6.0-20ubuntu1 \
        wget=1.17.1-1ubuntu1.1 \
        zlib1g-dev=1:1.2.8.dfsg-2ubuntu4.1 \
    && rm -rf /var/lib/apt/lists/*

# Generate the en_US.UTF-8 locale and select it for every later build step
# and for the running container.
RUN locale-gen en_US.UTF-8
ENV LANG=en_US.UTF-8 \
    LC_CTYPE=en_US.UTF-8 \
    LC_ALL=en_US.UTF-8

# Upgrade pip for both interpreters.
# pip 21.0 dropped support for Python 2.7 and Python 3.5 (the interpreter
# versions installed from apt in this image), so the upgrade is capped below
# that release; an uncapped upgrade can pull in a pip that no longer runs here.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip2 install --no-cache-dir --upgrade 'pip<21.0'
RUN pip3 install --no-cache-dir --upgrade 'pip<21.0'

# Install the Python dependencies, once per interpreter, with identical pins.
# blocks is installed from a fixed commit together with its requirements file.
# --no-cache-dir keeps pip's download cache out of the image layers.
# NOTE(review): the requirements file is read from the `stable` branch while
# blocks itself is pinned to a commit - confirm the two are meant to differ.
RUN pip2 install --no-cache-dir \
        cython==0.25.2 \
        jupyter==1.0.0 \
        matplotlib==2.0.0 \
        nltk==3.2.2 \
        pandas==0.19.2 \
        git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \
        -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt

RUN pip3 install --no-cache-dir \
        cython==0.25.2 \
        jupyter==1.0.0 \
        matplotlib==2.0.0 \
        nltk==3.2.2 \
        pandas==0.19.2 \
        git+https://github.com/mila-udem/blocks.git@7beb788f1fcfc78d56c59a5edf9b4e8d98f8d7d9 \
        -r https://raw.githubusercontent.com/mila-udem/blocks/stable/requirements.txt

# The blocks requirements install an old numpy; replace it with the newest
# release available for each interpreter.
# --no-cache-dir keeps pip's download cache out of the image layers.
RUN pip2 install --no-cache-dir --upgrade numpy
RUN pip3 install --no-cache-dir --upgrade numpy

# Clone gensim, check out the release tag named by $GENSIM_VERSION and install
# it (with its "test" extras) for both Python 2 and Python 3.
# - The clone destination is given explicitly so the step does not rely on the
#   working directory being /.
# - '.[test]' is quoted so the shell cannot interpret the brackets as a
#   filename pattern.
# - --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone "$GENSIM_REPOSITORY" /gensim \
    && cd /gensim \
    && git checkout "tags/$GENSIM_VERSION" \
    && pip2 install --no-cache-dir '.[test]' \
    && python2 setup.py install \
    && pip3 install --no-cache-dir '.[test]' \
    && python3 setup.py install

# Directory that holds the external tools used by the gensim wrappers.
# -p makes the step succeed even if the directory already exists.
RUN mkdir -p /gensim/gensim_dependencies

# Paths of the wrapper tools, exported as environment variables.
ENV FT_HOME=/gensim/gensim_dependencies/fastText \
    MALLET_HOME=/gensim/gensim_dependencies/mallet \
    DTM_PATH=/gensim/gensim_dependencies/dtm/dtm/main \
    VOWPAL_WABBIT_PATH=/gensim/gensim_dependencies/vowpal_wabbit/vowpalwabbit/vw

# Fixed revisions (git commits, or a release number for Mallet) of the wrapper
# dependencies downloaded by the install steps.
ENV FASTTEXT_VERSION=f24a781021862f0e475a5fb9c55b7c1cec3b6e2e \
    MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION=ec2e37a3bcb8bd7b56b75b043c47076bc5decf22 \
    DTM_VERSION=67139e6f526b2bc33aef56dc36176a1b8b210056 \
    MALLET_VERSION=2.0.8 \
    VOWPAL_WABBIT_VERSION=69ecc2847fa0c876c6e0557af409f386f0ced59a

# Install custom dependencies

# TODO: Install wordrank (need to install mpich/openmpi with multithreading enabled)

# fastText: clone into the dependencies directory, pin the working tree to the
# commit in $FASTTEXT_VERSION, then build with make.
RUN git clone https://github.com/facebookresearch/fastText.git /gensim/gensim_dependencies/fastText \
    && cd /gensim/gensim_dependencies/fastText \
    && git checkout $FASTTEXT_VERSION \
    && make

# MorphologicalPriorsForWordEmbeddings: clone into the dependencies directory
# and pin the working tree to the commit in
# $MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION (no build step is run).
RUN git clone https://github.com/rguthrie3/MorphologicalPriorsForWordEmbeddings.git \
        /gensim/gensim_dependencies/MorphologicalPriorsForWordEmbeddings \
    && git -C /gensim/gensim_dependencies/MorphologicalPriorsForWordEmbeddings \
        checkout $MORPHOLOGICALPRIORSFORWORDEMBEDDINGS_VERSION

# DTM: clone into the dependencies directory, pin the working tree to the
# commit in $DTM_VERSION, then build from the repository's dtm/ subdirectory.
RUN git clone https://github.com/blei-lab/dtm.git /gensim/gensim_dependencies/dtm \
    && cd /gensim/gensim_dependencies/dtm/dtm \
    && git checkout $DTM_VERSION \
    && make

# Mallet: download the release archive for $MALLET_VERSION into a scratch
# directory, move its contents into the mallet directory, delete the scratch
# directory in the same layer, then build with ant.
# NOTE(review): the archive is fetched over plain HTTP and is not
# checksum-verified - consider adding a verification step.
RUN mkdir /gensim/gensim_dependencies/mallet /gensim/gensim_dependencies/download \
    && ( cd /gensim/gensim_dependencies/download \
         && wget -q http://mallet.cs.umass.edu/dist/mallet-$MALLET_VERSION.zip \
         && unzip mallet-$MALLET_VERSION.zip \
         && mv ./mallet-$MALLET_VERSION/* /gensim/gensim_dependencies/mallet ) \
    && rm -rf /gensim/gensim_dependencies/download \
    && cd /gensim/gensim_dependencies/mallet \
    && ant

# Vowpal Wabbit: clone into the dependencies directory, pin the working tree
# to the commit in $VOWPAL_WABBIT_VERSION, then build and install.
RUN git clone https://github.com/JohnLangford/vowpal_wabbit.git /gensim/gensim_dependencies/vowpal_wabbit \
    && cd /gensim/gensim_dependencies/vowpal_wabbit \
    && git checkout $VOWPAL_WABBIT_VERSION \
    && make \
    && make install

# Verify, for each interpreter, that gensim.models.word2vec_inner can be
# imported; the check script exits non-zero otherwise, which fails the build.
# NOTE(review): these paths assume the checked-out gensim tag contains the
# docker/ directory - confirm.
RUN python2 /gensim/docker/check_fast_version.py
RUN python3 /gensim/docker/check_fast_version.py

# Make the startup script executable.
RUN chmod +x /gensim/docker/start_jupyter_notebook.sh

# Document the notebook port and start Jupyter on it by default.
# Exec (JSON) form is used so that no extra `sh -c` wrapper process sits
# between Docker and the startup script.
EXPOSE 9000
CMD ["/gensim/docker/start_jupyter_notebook.sh", "9000"]
21 changes: 21 additions & 0 deletions docker/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Build gensim image

From the `docker` directory, run the following command to build the image locally:

```
docker build -t gensim .
```

# Run a Jupyter notebook with gensim installed

Just execute:

```
docker run -p 9000:9000 gensim
```

# Run an interactive bash session

```
docker run -it gensim /bin/bash
```
10 changes: 10 additions & 0 deletions docker/check_fast_version.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Check that FAST_VERSION can be imported from gensim.models.word2vec_inner.

Exits with status 0 (after printing the value) when the import succeeds, and
with a non-zero status when it raises ImportError.
"""
import sys

try:
    from gensim.models.word2vec_inner import FAST_VERSION
except ImportError:
    print('Failed... fall back to plain numpy (20-80x slower training than the above)')
    sys.exit(-1)

# The value is formatted into a single string: this script is run with both
# python2 and python3, and under Python 2 `print(a, b)` would print a tuple.
print('FAST_VERSION ok ! Retrieved with value {}'.format(FAST_VERSION))
sys.exit()
7 changes: 7 additions & 0 deletions docker/start_jupyter_notebook.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#!/bin/bash
# Start a Jupyter notebook server rooted at /gensim/docs/notebooks.
#
# Usage: start_jupyter_notebook.sh [PORT]
#   PORT  port the server listens on (defaults to 9000 when omitted)

PORT="${1:-9000}"
NOTEBOOK_DIR=/gensim/docs/notebooks
DEFAULT_URL=/notebooks/gensim%20Quick%20Start.ipynb

# exec replaces this shell with the Jupyter process, so signals sent to the
# script (for example when the container is stopped) reach Jupyter directly.
# '--ip=*' is quoted so the shell never treats the '*' as a filename pattern.
# The token argument is still passed as the two-character string "".
exec jupyter notebook \
    --no-browser \
    '--ip=*' \
    --port="$PORT" \
    --allow-root \
    --notebook-dir="$NOTEBOOK_DIR" \
    --NotebookApp.token=\"\" \
    --NotebookApp.default_url="$DEFAULT_URL"

0 comments on commit 3978e5a

Please sign in to comment.