Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
xchange11 committed Jun 11, 2021
0 parents commit d7e92bf
Show file tree
Hide file tree
Showing 27 changed files with 1,134 additions and 0 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# in order to activate CD to Heroku:
# - activate the tests in GitHub CI
# - uncomment the content of the CD paragraph (lines 57-75)

name: Python package

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          make install_requirements
      - name: Install package
        run: |
          make install
      - name: Test package
        run: |
          make test
      - name: Clean everything
        run: |
          make clean
    strategy:
      matrix:
        python-version: [3.8]

  # 🤖 CD paragraph
  #
  # uncomment the following lines to activate CD to Heroku
  # - remove the 2 trailing characters "# ", do not change the spaces
  #   (there should be 2 spaces before the `deploy_heroku` key)
  # - keep in mind you also need to configure Heroku HEROKU_API_KEY and HEROKU_EMAIL in GitHub secrets
  # - and replace REPLACE_WITH_YOUR_HEROKU_APP_NAME in this file with the name of your Heroku app

  # deploy_heroku:
  #
  #   runs-on: ubuntu-latest
  #
  #   steps:
  #     - uses: actions/checkout@v2
  #     - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action
  #       with:
  #         heroku_api_key: ${{secrets.HEROKU_API_KEY}}
  #         heroku_app_name: ${{secrets.HEROKU_APP_NAME}}
  #         heroku_email: ${{secrets.HEROKU_EMAIL}}
  #         appdir: "api"
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Python packaging and coverage artifacts
*.egg-info
.coverage
# Notebook checkpoints and macOS Finder metadata
.ipynb_checkpoints
.DS_Store
# Local training data, bytecode caches, and secrets — never commit these
raw_data/
**/__pycache__/
.env
21 changes: 21 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Anton Bauer, Barbara Hartmann, Felipe Lopes, Andreas Tai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include requirements.txt
138 changes: 138 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# ----------------------------------
# LOOK FOR .env FILE
# ----------------------------------
# If a .env file exists, read it and export its variables to all recipes
# (used for GCP_PROJECT_ID, GCP_REGION, GCP_BUCKET_NAME below).
ifneq (,$(wildcard ./.env))
include .env
export
endif

# ----------------------------------
# INSTALL & TEST
# ----------------------------------
# All of these are commands, not files — declare them phony so a stray
# file named e.g. `test` or `clean` cannot shadow the target.
.PHONY: install_requirements check_code black test ftest clean install all run_api count_lines

install_requirements:
	@pip install -r requirements.txt

check_code:
	@flake8 scripts/* bundestag/*.py

black:
	@black scripts/* bundestag/*.py

test:
	@coverage run -m pytest tests/*.py
	@coverage report -m --omit="${VIRTUAL_ENV}/lib/python*"

# Fast test: run the suite directly, without the coverage overhead.
# (Previously a "Write me" placeholder that failed with command-not-found.)
ftest:
	@pytest tests/*.py

clean:
	@rm -f */version.txt
	@rm -f .coverage
	@rm -fr */__pycache__ */*.pyc __pycache__
	@rm -fr build dist
	@rm -fr bundestag-*.dist-info
	@rm -fr bundestag.egg-info

install:
	@pip install . -U

all: clean install test black check_code

run_api:
	uvicorn api.bundestag:app --reload

# Per-file and total line counts for sources, scripts and tests.
count_lines:
	@find ./ -name '*.py' -exec wc -l {} \; | sort -n| awk \
        '{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}'
	@echo ''
	@find ./scripts -name '*-*' -exec wc -l {} \; | sort -n| awk \
        '{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}'
	@echo ''
	@find ./tests -name '*.py' -exec wc -l {} \; | sort -n| awk \
        '{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}'
	@echo ''

# ----------------------------------
# UPLOAD PACKAGE TO PYPI
# ----------------------------------
# Simple (:=) flavour: the value is a constant, no late binding needed.
PYPI_USERNAME := <AUTHOR>

# `build` MUST be phony: `python setup.py` creates a real ./build directory,
# which would otherwise make this target look permanently up to date.
.PHONY: build pypi_test pypi

build:
	@python setup.py sdist bdist_wheel

pypi_test:
	@twine upload -r testpypi dist/* -u $(PYPI_USERNAME)

pypi:
	@twine upload dist/* -u $(PYPI_USERNAME)

# ----------------------------------
# GCLOUD INTEGRATION
# ----------------------------------
# GCP_PROJECT_ID, GCP_REGION and GCP_BUCKET_NAME are expected from .env (see top).
PACKAGE_NAME := bundestag
PYTHON_VERSION := 3.7
FRAMEWORK := scikit-learn
RUNTIME_VERSION := 1.15

SPEECH_SEGMENTS_A_PATH := raw_data/speech_segments_a.csv
SPEECH_SEGMENTS_B_PATH := raw_data/speech_segments_b.csv
BIO_DATA_PATH := raw_data/bio_data.csv

BUCKET_DATA_FOLDER := trained
# $(notdir …) is the built-in equivalent of `basename` — no shell fork,
# and := evaluates it once instead of on every expansion.
SPEECH_A_BUCKET_FILE_NAME := $(notdir $(SPEECH_SEGMENTS_A_PATH))
SPEECH_B_BUCKET_FILE_NAME := $(notdir $(SPEECH_SEGMENTS_B_PATH))
BIO_BUCKET_FILE_NAME := $(notdir $(BIO_DATA_PATH))
BUCKET_TRAINING_FOLDER := trainings
FILENAME_FIRST := trainer
FILENAME_SECOND := bundestrainer

# Recursive (=) on purpose: the timestamp must be taken when a submit
# recipe expands it, so every training job gets a fresh, unique name.
JOB_NAME = bundestag_training_pipeline_$(shell date +'%Y%m%d_%H%M%S')

.PHONY: set_project create_bucket upload_data \
        gcp_submit_first_training gcp_submit_second_training \
        update_first_model update_second_model update_w2v_model update_party_mapping \
        do_gcp_setup do_gcp_first_model_training do_gcp_second_model_training

set_project:
	@gcloud config set project ${GCP_PROJECT_ID}

create_bucket:
	@gsutil mb -l ${GCP_REGION} -p ${GCP_PROJECT_ID} gs://${GCP_BUCKET_NAME}

upload_data:
	@gsutil cp ${SPEECH_SEGMENTS_A_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${SPEECH_A_BUCKET_FILE_NAME}
	@gsutil cp ${SPEECH_SEGMENTS_B_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${SPEECH_B_BUCKET_FILE_NAME}
	@gsutil cp ${BIO_DATA_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${BIO_BUCKET_FILE_NAME}

gcp_submit_first_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${GCP_BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME_FIRST} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--region ${GCP_REGION} \
		--stream-logs

gcp_submit_second_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${GCP_BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME_SECOND} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--region ${GCP_REGION} \
		--stream-logs

# Pull trained artifacts from the bucket into the api/ deployment folder.
update_first_model:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model.joblib api/model.joblib

update_second_model:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model2.tf api/model2.tf

update_w2v_model:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model2.w2v api/model2.w2v

update_party_mapping:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model2.pm api/model2.pm

do_gcp_setup: set_project create_bucket

do_gcp_first_model_training: upload_data gcp_submit_first_training

do_gcp_second_model_training: upload_data gcp_submit_second_training
72 changes: 72 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Bundesterminator
## What is it about?
The project explores ways to predict party affiliation by text segments.
Machine Learning and Deep Learning approaches are tested. This is the
result of a two-week project from the
[Le Wagon](https://www.lewagon.com/de/berlin/data-science-course/full-time)
Data Science Bootcamp, Batch 606 Berlin.

A demo is available at the following URL http://bundesterminator.herokuapp.com/.

## Data
For training the models the meeting minutes of the German
Parliament were used. They are available as XML files from the
[open data website](https://www.bundestag.de/services/opendata)
of the German Parliament. The XML files were pre-processed
and translated into CSV files (currently the python framework pandas
has no XML import).

## Folder Structure
### api
The trained model can be exposed by a web API. It uses a lean setting based
on [FastAPI](https://fastapi.tiangolo.com/) and
[Uvicorn](https://www.uvicorn.org/). The deployment settings assume
a deployment on Heroku.

### bundestag
The `bundestag` folder represents the `bundestag` python package.
It contains the main files for training the models.

#### trainer.py
Pipeline for a machine learning approach.

#### bundestrainer.py
Class to wrap functionalities to train a Deep Learning model with
[Tensorflow Keras](https://www.tensorflow.org/guide/keras/sequential_model)
and a trained
[Gensim word2vec model](https://radimrehurek.com/gensim/models/word2vec.html).

#### bundes_w2v.py
Light wrapper to the Gensim w2v module.

#### data.py
Helper function to acquire the data.

#### utils.py
Helper function to pre-process the data.

## Deployment
Other files are added to enable deployment of the API to Heroku and
to have an automated workflow based on GitHub Actions.

Please note that you need to set environment variables to deploy on
Google Cloud Platform. This needs to be done directly in `data.py`,
`trainer.py` and `bundestrainer.py`. For the `Makefile`, environment
variables need to be set. This will be replaced by a more flexible
approach in the future.

## Licence
[MIT](https://opensource.org/licenses/MIT)

## Team
The work is a collaborative effort of the following team members
who each contributed to the project:

* [Anton Bauer](https://github.com/g-wagen)
* [Barbara Hartmann](https://github.com/BabsBerlin)
* [Felipe Lopes](https://github.com/felipebool)
* [Andreas Tai](https://github.com/xchange11)

## Thanks
We can not thank enough the AMAAAAAZIIIING team of Le Wagon.
The patience, expertise, and dedication opened a new world for us.
1 change: 1 addition & 0 deletions api/Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
web: uvicorn bundestag:app --host=0.0.0.0 --port=${PORT:-5000}
Empty file added api/__init__.py
Empty file.
69 changes: 69 additions & 0 deletions api/bundestag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import random
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import joblib
import pandas as pd
import os
from bundestag.bundestrainer import Bundestrainer

# The system variable "DYNO" is only present in the Heroku prod env.
ON_PRODUCTION = 'DYNO' in os.environ

# Locally the server is started from the repository root, so the model
# files live under api/; on Heroku the app directory is the cwd.
model_dir = ""
if not ON_PRODUCTION:  # idiomatic truth test instead of `== False`
    model_dir = "api/"

# Prepare the Deep Learning model for prediction: Keras model, trained
# word2vec model, and the label -> party-name mapping.
bundestrainer = Bundestrainer()
bundestrainer.load_model(f'{model_dir}model2.tf')
bundestrainer.load_w2c(f'{model_dir}model2.w2v')
bundestrainer.load_party_mapping(f'{model_dir}party_mapping.txt')

app = FastAPI()

# Wide-open CORS so the demo frontend can call the API from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

@app.get("/predict")
def predict(speech_fragment, model=1):
    """Predict the party affiliation for a speech fragment.

    Query parameters:
        speech_fragment: the text segment to classify.
        model: "1" = baseline sklearn pipeline, "2" = random choice
               (control baseline), "3" = deep-learning model.

    Returns:
        dict with a single key ``prediction`` holding the party name
        (empty string for an unknown model id).
    """
    # FastAPI passes query parameters as strings, but the declared default
    # is the int 1 — normalise so the default actually selects model 1.
    # (Previously `model=1` matched no branch and returned "".)
    model = str(model)

    X = pd.Series([speech_fragment])

    pred = ""

    # Baseline Model: sklearn pipeline loaded from disk on each request.
    if model == "1":
        pipeline = joblib.load(f'{model_dir}model.joblib')

        # make prediction
        results = pipeline.predict(X)

        # convert response from numpy to python type
        pred = results[0]

    # Random Generator — control baseline for comparison.
    elif model == "2":
        parties = [
            'CDU',
            'CSU',
            'SPD',
            'FDP',
            'LINKE',
            'GRÜNE',
            'PARTEI',
            'ÖDP',
            'PIRATEN',
        ]
        pred = random.choice(parties)

    # Deep Learning model (module-level Bundestrainer instance).
    elif model == "3":
        pred = bundestrainer.predict_party_by_string(speech_fragment)

    return dict(prediction=pred)
Binary file added api/model.joblib
Binary file not shown.
Binary file added api/model2.joblib
Binary file not shown.
Binary file added api/model2.tf
Binary file not shown.
Binary file added api/model2.w2v
Binary file not shown.
Binary file added api/model2.word2vec
Binary file not shown.
Loading

0 comments on commit d7e92bf

Please sign in to comment.