Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
xchange11 committed Jun 11, 2021
0 parents commit d7e92bf
Show file tree
Hide file tree
Showing 27 changed files with 1,134 additions and 0 deletions.
58 changes: 58 additions & 0 deletions .github/workflows/pythonpackage.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
# in order to activate CD to Heroku:
# - activate the tests in GitHub CI
# - uncomment the content of the CD paragraph (lines 57-75)

name: Python package

on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]

jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v1
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          make install_requirements
      - name: Install package
        run: |
          make install
      - name: Test package
        run: |
          make test
      - name: Clean everything
        run: |
          make clean
    strategy:
      matrix:
        python-version: [3.8]

  # 🤖 CD paragraph
  #
  # uncomment the following lines to activate CD to Heroku
  # - remove the 2 trailing characters "# ", do not change the spaces
  #   (there should be 2 spaces before the `deploy_heroku` key)
  # - keep in mind you also need to configure Heroku HEROKU_API_KEY and HEROKU_EMAIL in GitHub secrets
  # - and replace REPLACE_WITH_YOUR_HEROKU_APP_NAME in this file with the name of your Heroku app

  # deploy_heroku:
  #
  #   runs-on: ubuntu-latest
  #
  #   steps:
  #     - uses: actions/checkout@v2
  #     - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action
  #       with:
  #         heroku_api_key: ${{secrets.HEROKU_API_KEY}}
  #         heroku_app_name: ${{secrets.HEROKU_APP_NAME}}
  #         heroku_email: ${{secrets.HEROKU_EMAIL}}
  #         appdir: "api"
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Python packaging and coverage artifacts
*.egg-info
.coverage
# Notebook checkpoints and macOS Finder metadata
.ipynb_checkpoints
.DS_Store
# Local training data, bytecode caches, and secrets — never commit these
raw_data/
**/__pycache__/
.env
21 changes: 21 additions & 0 deletions LICENSE.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 Anton Bauer, Barbara Hartmann, Felipe Lopes, Andreas Tai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include requirements.txt
138 changes: 138 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# ----------------------------------
# LOOK FOR .env FILE
# ----------------------------------
# If a .env file exists, read it and export its variables to all recipes
# (used for GCP_PROJECT_ID, GCP_REGION, GCP_BUCKET_NAME below).
ifneq (,$(wildcard ./.env))
include .env
export
endif

# ----------------------------------
# INSTALL & TEST
# ----------------------------------
# All of these are commands, not files — declare them phony so a stray
# file named e.g. `test` or `clean` cannot shadow the target.
.PHONY: install_requirements check_code black test ftest clean install all run_api count_lines

install_requirements:
	@pip install -r requirements.txt

check_code:
	@flake8 scripts/* bundestag/*.py

black:
	@black scripts/* bundestag/*.py

test:
	@coverage run -m pytest tests/*.py
	@coverage report -m --omit="${VIRTUAL_ENV}/lib/python*"

# Fast test: run the suite directly, without the coverage overhead.
# (Previously a "Write me" placeholder that failed with command-not-found.)
ftest:
	@pytest tests/*.py

clean:
	@rm -f */version.txt
	@rm -f .coverage
	@rm -fr */__pycache__ */*.pyc __pycache__
	@rm -fr build dist
	@rm -fr bundestag-*.dist-info
	@rm -fr bundestag.egg-info

install:
	@pip install . -U

all: clean install test black check_code

run_api:
	uvicorn api.bundestag:app --reload

# Per-file and total line counts for sources, scripts and tests.
count_lines:
	@find ./ -name '*.py' -exec wc -l {} \; | sort -n| awk \
        '{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}'
	@echo ''
	@find ./scripts -name '*-*' -exec wc -l {} \; | sort -n| awk \
        '{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}'
	@echo ''
	@find ./tests -name '*.py' -exec wc -l {} \; | sort -n| awk \
        '{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}'
	@echo ''

# ----------------------------------
# UPLOAD PACKAGE TO PYPI
# ----------------------------------
# Simple (:=) flavour: the value is a constant, no late binding needed.
PYPI_USERNAME := <AUTHOR>

# `build` MUST be phony: `python setup.py` creates a real ./build directory,
# which would otherwise make this target look permanently up to date.
.PHONY: build pypi_test pypi

build:
	@python setup.py sdist bdist_wheel

pypi_test:
	@twine upload -r testpypi dist/* -u $(PYPI_USERNAME)

pypi:
	@twine upload dist/* -u $(PYPI_USERNAME)

# ----------------------------------
# GCLOUD INTEGRATION
# ----------------------------------
# GCP_PROJECT_ID, GCP_REGION and GCP_BUCKET_NAME are expected from .env (see top).
PACKAGE_NAME := bundestag
PYTHON_VERSION := 3.7
FRAMEWORK := scikit-learn
RUNTIME_VERSION := 1.15

SPEECH_SEGMENTS_A_PATH := raw_data/speech_segments_a.csv
SPEECH_SEGMENTS_B_PATH := raw_data/speech_segments_b.csv
BIO_DATA_PATH := raw_data/bio_data.csv

BUCKET_DATA_FOLDER := trained
# $(notdir …) is the built-in equivalent of `basename` — no shell fork,
# and := evaluates it once instead of on every expansion.
SPEECH_A_BUCKET_FILE_NAME := $(notdir $(SPEECH_SEGMENTS_A_PATH))
SPEECH_B_BUCKET_FILE_NAME := $(notdir $(SPEECH_SEGMENTS_B_PATH))
BIO_BUCKET_FILE_NAME := $(notdir $(BIO_DATA_PATH))
BUCKET_TRAINING_FOLDER := trainings
FILENAME_FIRST := trainer
FILENAME_SECOND := bundestrainer

# Recursive (=) on purpose: the timestamp must be taken when a submit
# recipe expands it, so every training job gets a fresh, unique name.
JOB_NAME = bundestag_training_pipeline_$(shell date +'%Y%m%d_%H%M%S')

.PHONY: set_project create_bucket upload_data \
        gcp_submit_first_training gcp_submit_second_training \
        update_first_model update_second_model update_w2v_model update_party_mapping \
        do_gcp_setup do_gcp_first_model_training do_gcp_second_model_training

set_project:
	@gcloud config set project ${GCP_PROJECT_ID}

create_bucket:
	@gsutil mb -l ${GCP_REGION} -p ${GCP_PROJECT_ID} gs://${GCP_BUCKET_NAME}

upload_data:
	@gsutil cp ${SPEECH_SEGMENTS_A_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${SPEECH_A_BUCKET_FILE_NAME}
	@gsutil cp ${SPEECH_SEGMENTS_B_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${SPEECH_B_BUCKET_FILE_NAME}
	@gsutil cp ${BIO_DATA_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${BIO_BUCKET_FILE_NAME}

gcp_submit_first_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${GCP_BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME_FIRST} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--region ${GCP_REGION} \
		--stream-logs

gcp_submit_second_training:
	gcloud ai-platform jobs submit training ${JOB_NAME} \
		--job-dir gs://${GCP_BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \
		--package-path ${PACKAGE_NAME} \
		--module-name ${PACKAGE_NAME}.${FILENAME_SECOND} \
		--python-version=${PYTHON_VERSION} \
		--runtime-version=${RUNTIME_VERSION} \
		--region ${GCP_REGION} \
		--stream-logs

# Pull trained artifacts from the bucket into the api/ deployment folder.
update_first_model:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model.joblib api/model.joblib

update_second_model:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model2.tf api/model2.tf

update_w2v_model:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model2.w2v api/model2.w2v

update_party_mapping:
	@gsutil cp gs://${GCP_BUCKET_NAME}/model2.pm api/model2.pm

do_gcp_setup: set_project create_bucket

do_gcp_first_model_training: upload_data gcp_submit_first_training

do_gcp_second_model_training: upload_data gcp_submit_second_training
72 changes: 72 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Bundesterminator
## What is it about?
The project explores ways to predict party affiliation by text segments.
Machine Learning and Deep Learning approaches are tested. This is the
result of a two-week project from the
[Le Wagon](https://www.lewagon.com/de/berlin/data-science-course/full-time)
Data Science Bootcamp, Batch 606 Berlin.

A demo is available at the following URL http://bundesterminator.herokuapp.com/.

## Data
For training the models the meeting minutes of the German
Parliament were used. They are available as XML files from the
[open data website](https://www.bundestag.de/services/opendata)
of the German Parliament. The XML files were pre-processed
and translated into CSV files (currently the python framework pandas
has no XML import).

## Folder Structure
### api
The trained model can be exposed by a web API. It uses a lean setting based
on [FastAPI](https://fastapi.tiangolo.com/) and
[Uvicorn](https://www.uvicorn.org/). The deployment settings assume
a deployment on Heroku.

### bundestag
The `bundestag` folder represents the `bundestag` python package.
It contains the main files for training the models.

#### trainer.py
Pipeline for a machine learning approach.

#### bundestrainer.py
Class to wrap functionalities to train a Deep Learning model with
[Tensorflow Keras](https://www.tensorflow.org/guide/keras/sequential_model)
and a trained
[Gensim word2vec model](https://radimrehurek.com/gensim/models/word2vec.html).

#### bundes_w2v.py
Light wrapper to the Gensim w2v module.

#### data.py
Helper function to acquire the data.

#### utils.py
Helper function to pre-process the data.

## Deployment
Other files are added to enable deployment of the API to Heroku and
to have an automated workflow based on GitHub Actions.

Please note that you need to set environment variables to deploy on
Google Cloud Platform. This needs to be done directly in `data.py`,
`trainer.py` and `bundestrainer.py`. For the `Makefile`, environment
variables need to be set. This will be replaced by a more flexible
approach in the future.

## Licence
[MIT](https://opensource.org/licenses/MIT)

## Team
The work is a collaborative effort of the following team members
who each contributed to the project:

* [Anton Bauer](https://github.com/g-wagen)
* [Barbara Hartmann](https://github.com/BabsBerlin)
* [Felipe Lopes](https://github.com/felipebool)
* [Andreas Tai](https://github.com/xchange11)

## Thanks
We can not thank enough the AMAAAAAZIIIING team of Le Wagon.
The patience, expertise, and dedication opened a new world for us.
1 change: 1 addition & 0 deletions api/Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
web: uvicorn bundestag:app --host=0.0.0.0 --port=${PORT:-5000}
Empty file added api/__init__.py
Empty file.
69 changes: 69 additions & 0 deletions api/bundestag.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
import random
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
import joblib
import pandas as pd
import os
from bundestag.bundestrainer import Bundestrainer

# The system variable "DYNO" is only present in the Heroku prod env.
ON_PRODUCTION = 'DYNO' in os.environ

# Locally the server is started from the repository root, so the model
# files live under api/; on Heroku the app directory is the cwd.
model_dir = ""
if not ON_PRODUCTION:  # idiomatic truth test instead of `== False`
    model_dir = "api/"

# Prepare the Deep Learning model for prediction: Keras model, trained
# word2vec model, and the label -> party-name mapping.
bundestrainer = Bundestrainer()
bundestrainer.load_model(f'{model_dir}model2.tf')
bundestrainer.load_w2c(f'{model_dir}model2.w2v')
bundestrainer.load_party_mapping(f'{model_dir}party_mapping.txt')

app = FastAPI()

# Wide-open CORS so the demo frontend can call the API from any origin.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

@app.get("/predict")
def predict(speech_fragment, model=1):
    """Predict the party affiliation for a speech fragment.

    Query parameters:
        speech_fragment: the text segment to classify.
        model: "1" = baseline sklearn pipeline, "2" = random choice
               (control baseline), "3" = deep-learning model.

    Returns:
        dict with a single key ``prediction`` holding the party name
        (empty string for an unknown model id).
    """
    # FastAPI passes query parameters as strings, but the declared default
    # is the int 1 — normalise so the default actually selects model 1.
    # (Previously `model=1` matched no branch and returned "".)
    model = str(model)

    X = pd.Series([speech_fragment])

    pred = ""

    # Baseline Model: sklearn pipeline loaded from disk on each request.
    if model == "1":
        pipeline = joblib.load(f'{model_dir}model.joblib')

        # make prediction
        results = pipeline.predict(X)

        # convert response from numpy to python type
        pred = results[0]

    # Random Generator — control baseline for comparison.
    elif model == "2":
        parties = [
            'CDU',
            'CSU',
            'SPD',
            'FDP',
            'LINKE',
            'GRÜNE',
            'PARTEI',
            'ÖDP',
            'PIRATEN',
        ]
        pred = random.choice(parties)

    # Deep Learning model (module-level Bundestrainer instance).
    elif model == "3":
        pred = bundestrainer.predict_party_by_string(speech_fragment)

    return dict(prediction=pred)
Binary file added api/model.joblib
Binary file not shown.
Binary file added api/model2.joblib
Binary file not shown.
Binary file added api/model2.tf
Binary file not shown.
Binary file added api/model2.w2v
Binary file not shown.
Binary file added api/model2.word2vec
Binary file not shown.
Loading

0 comments on commit d7e92bf

Please sign in to comment.