-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit d7e92bf
Showing
27 changed files
with
1,134 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
# in order to activate CD to Heroku: | ||
# - activate the tests in GitHub CI | ||
# - uncomment the content of the CD paragraph (the commented-out `deploy_heroku` job at the end of this file) | ||
|
||
name: Python package | ||
|
||
on: | ||
push: | ||
branches: [ master ] | ||
pull_request: | ||
branches: [ master ] | ||
|
||
jobs: | ||
build: | ||
runs-on: ubuntu-latest | ||
steps: | ||
- uses: actions/checkout@v2 | ||
- name: Set up Python ${{ matrix.python-version }} | ||
uses: actions/setup-python@v1 | ||
with: | ||
python-version: ${{ matrix.python-version }} | ||
- name: Install dependencies | ||
run: | | ||
python -m pip install --upgrade pip | ||
make install_requirements | ||
- name: Install package | ||
run: | | ||
make install | ||
- name: Test package | ||
run: | | ||
make test | ||
- name: Clean everything | ||
run: | | ||
make clean | ||
strategy: | ||
matrix: | ||
python-version: [3.8] | ||
|
||
# 🤖 CD paragraph | ||
# | ||
# uncomment the following lines to activate CD to Heroku | ||
# - remove the 2 leading characters "# " from each line, do not change the remaining spaces | ||
# (there should be 2 spaces before the `deploy_heroku` key) | ||
# - keep in mind you also need to configure HEROKU_API_KEY, HEROKU_APP_NAME and HEROKU_EMAIL in GitHub secrets | ||
# - HEROKU_APP_NAME must hold the name of your Heroku app (the job below reads it from the secret) | ||
|
||
# deploy_heroku: | ||
# | ||
# runs-on: ubuntu-latest | ||
# | ||
# steps: | ||
# - uses: actions/checkout@v2 | ||
# - uses: akhileshns/heroku-deploy@v3.0.4 # This is the action | ||
# with: | ||
# heroku_api_key: ${{secrets.HEROKU_API_KEY}} | ||
# heroku_app_name: ${{secrets.HEROKU_APP_NAME}} | ||
# heroku_email: ${{secrets.HEROKU_EMAIL}} | ||
# appdir: "api" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
*.egg-info | ||
.coverage | ||
.ipynb_checkpoints | ||
.DS_Store | ||
raw_data/ | ||
**/__pycache__/ | ||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
MIT License | ||
|
||
Copyright (c) 2021 Anton Bauer, Barbara Hartmann, Felipe Lopes, Andreas Tai | ||
|
||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
|
||
The above copyright notice and this permission notice shall be included in all | ||
copies or substantial portions of the Software. | ||
|
||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||
SOFTWARE. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
include requirements.txt |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
# ---------------------------------- | ||
# LOOK FOR .env FILE | ||
# ---------------------------------- | ||
ifneq (,$(wildcard ./.env)) | ||
include .env | ||
export | ||
endif | ||
|
||
# ---------------------------------- | ||
# INSTALL & TEST | ||
# ---------------------------------- | ||
install_requirements: | ||
@pip install -r requirements.txt | ||
|
||
check_code: | ||
@flake8 scripts/* bundestag/*.py | ||
|
||
black: | ||
@black scripts/* bundestag/*.py | ||
|
||
test: | ||
@coverage run -m pytest tests/*.py | ||
@coverage report -m --omit="${VIRTUAL_ENV}/lib/python*" | ||
|
||
# Placeholder for functional tests. None exist yet, so fail with an explicit
# message instead of trying to execute a non-existent `Write` command.
ftest:
	@echo "ftest: functional tests are not implemented yet" >&2; exit 1
|
||
clean: | ||
@rm -f */version.txt | ||
@rm -f .coverage | ||
@rm -fr */__pycache__ */*.pyc __pycache__ | ||
@rm -fr build dist | ||
@rm -fr bundestag-*.dist-info | ||
@rm -fr bundestag.egg-info | ||
|
||
install: | ||
@pip install . -U | ||
|
||
all: clean install test black check_code | ||
|
||
run_api: | ||
uvicorn api.bundestag:app --reload | ||
|
||
count_lines: | ||
@find ./ -name '*.py' -exec wc -l {} \; | sort -n| awk \ | ||
'{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}' | ||
@echo '' | ||
@find ./scripts -name '*-*' -exec wc -l {} \; | sort -n| awk \ | ||
'{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}' | ||
@echo '' | ||
@find ./tests -name '*.py' -exec wc -l {} \; | sort -n| awk \ | ||
'{printf "%4s %s\n", $$1, $$2}{s+=$$0}END{print s}' | ||
@echo '' | ||
|
||
# ---------------------------------- | ||
# UPLOAD PACKAGE TO PYPI | ||
# ---------------------------------- | ||
PYPI_USERNAME=<AUTHOR> | ||
# Build the source and wheel distributions.
# Declared phony because the build leaves a `build/` directory behind (the
# `clean` target removes it); without this declaration make would treat the
# `build` target as up to date and skip the recipe on later runs.
.PHONY: build
build:
	@python setup.py sdist bdist_wheel
|
||
pypi_test: | ||
@twine upload -r testpypi dist/* -u $(PYPI_USERNAME) | ||
|
||
pypi: | ||
@twine upload dist/* -u $(PYPI_USERNAME) | ||
|
||
|
||
# ---------------------------------- | ||
# GCLOUD INTEGRATION | ||
# ---------------------------------- | ||
PACKAGE_NAME=bundestag | ||
PYTHON_VERSION=3.7 | ||
FRAMEWORK=scikit-learn | ||
RUNTIME_VERSION=1.15 | ||
|
||
SPEECH_SEGMENTS_A_PATH=raw_data/speech_segments_a.csv | ||
SPEECH_SEGMENTS_B_PATH=raw_data/speech_segments_b.csv | ||
BIO_DATA_PATH=raw_data/bio_data.csv | ||
|
||
BUCKET_DATA_FOLDER=trained | ||
SPEECH_A_BUCKET_FILE_NAME=$(shell basename ${SPEECH_SEGMENTS_A_PATH}) | ||
SPEECH_B_BUCKET_FILE_NAME=$(shell basename ${SPEECH_SEGMENTS_B_PATH}) | ||
BIO_BUCKET_FILE_NAME=$(shell basename ${BIO_DATA_PATH}) | ||
BUCKET_TRAINING_FOLDER=trainings | ||
FILENAME_FIRST=trainer | ||
FILENAME_SECOND=bundestrainer | ||
|
||
JOB_NAME=bundestag_training_pipeline_$(shell date +'%Y%m%d_%H%M%S') | ||
|
||
set_project: | ||
@gcloud config set project ${GCP_PROJECT_ID} | ||
|
||
create_bucket: | ||
@gsutil mb -l ${GCP_REGION} -p ${GCP_PROJECT_ID} gs://${GCP_BUCKET_NAME} | ||
|
||
upload_data: | ||
@gsutil cp ${SPEECH_SEGMENTS_A_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${SPEECH_A_BUCKET_FILE_NAME} | ||
@gsutil cp ${SPEECH_SEGMENTS_B_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${SPEECH_B_BUCKET_FILE_NAME} | ||
@gsutil cp ${BIO_DATA_PATH} gs://${GCP_BUCKET_NAME}/${BUCKET_DATA_FOLDER}/${BIO_BUCKET_FILE_NAME} | ||
|
||
gcp_submit_first_training: | ||
gcloud ai-platform jobs submit training ${JOB_NAME} \ | ||
--job-dir gs://${GCP_BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \ | ||
--package-path ${PACKAGE_NAME} \ | ||
--module-name ${PACKAGE_NAME}.${FILENAME_FIRST} \ | ||
--python-version=${PYTHON_VERSION} \ | ||
--runtime-version=${RUNTIME_VERSION} \ | ||
--region ${GCP_REGION} \ | ||
--stream-logs | ||
|
||
gcp_submit_second_training: | ||
gcloud ai-platform jobs submit training ${JOB_NAME} \ | ||
--job-dir gs://${GCP_BUCKET_NAME}/${BUCKET_TRAINING_FOLDER} \ | ||
--package-path ${PACKAGE_NAME} \ | ||
--module-name ${PACKAGE_NAME}.${FILENAME_SECOND} \ | ||
--python-version=${PYTHON_VERSION} \ | ||
--runtime-version=${RUNTIME_VERSION} \ | ||
--region ${GCP_REGION} \ | ||
--stream-logs | ||
|
||
update_first_model: | ||
@gsutil cp gs://${GCP_BUCKET_NAME}/model.joblib api/model.joblib | ||
|
||
update_second_model: | ||
@gsutil cp gs://${GCP_BUCKET_NAME}/model2.tf api/model2.tf | ||
|
||
update_w2v_model: | ||
@gsutil cp gs://${GCP_BUCKET_NAME}/model2.w2v api/model2.w2v | ||
|
||
update_party_mapping: | ||
@gsutil cp gs://${GCP_BUCKET_NAME}/model2.pm api/model2.pm | ||
|
||
do_gcp_setup: set_project create_bucket | ||
|
||
do_gcp_first_model_training: upload_data gcp_submit_first_training | ||
|
||
do_gcp_second_model_training: upload_data gcp_submit_second_training |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# Bundesterminator | ||
## What is it about? | ||
The project explores ways to predict party affiliation by text segments. | ||
Machine Learning and Deep Learning approaches are tested. This is the | ||
result of a two-week project from the | ||
[Le Wagon](https://www.lewagon.com/de/berlin/data-science-course/full-time) | ||
Data Science Bootcamp, Batch 606 Berlin. | ||
|
||
A demo is available at the following URL http://bundesterminator.herokuapp.com/. | ||
|
||
## Data | ||
For training the models the meeting minutes of the German | ||
Parliament were used. They are available as XML files from the | ||
[open data website](https://www.bundestag.de/services/opendata) | ||
of the German Parliament. The XML files were pre-processed | ||
and translated into CSV files (currently the python framework pandas | ||
has no XML import). | ||
|
||
## Folder Structure | ||
### api | ||
The trained model can be exposed by a web API. It uses a lean setting based | ||
on [FastAPI](https://fastapi.tiangolo.com/) and | ||
[Uvicorn](https://www.uvicorn.org/). The deployment settings assume | ||
a deployment on Heroku. | ||
|
||
### bundestag | ||
The `bundestag` folder represents the `bundestag` python package. | ||
It contains the main files for training the models. | ||
|
||
#### trainer.py | ||
Pipeline for a machine learning approach. | ||
|
||
#### bundestrainer.py | ||
Class to wrap functionalities to train a Deep Learning model with | ||
[Tensorflow Keras](https://www.tensorflow.org/guide/keras/sequential_model) | ||
and a trained | ||
[Gensim word2vec model](https://radimrehurek.com/gensim/models/word2vec.html). | ||
|
||
#### bundes_w2v.py | ||
Light wrapper to the Gensim w2v module. | ||
|
||
#### data.py | ||
Helper function to acquire the data. | ||
|
||
#### utils.py | ||
Helper function to pre-process the data. | ||
|
||
## Deployment | ||
Other files are added to enable deployment of the API to Heroku and | ||
to have an automated workflow based on GitHub Actions. | ||
|
||
Please note that you need to set environment variables to deploy on | ||
Google Cloud Platform. This needs to be done directly in `data.py`, | ||
`trainer.py` and `bundestrainer.py`. For the `Makefile`, environment | ||
variables need to be set. This will be replaced by a more flexible | ||
approach in the future. | ||
|
||
## Licence | ||
[MIT](https://opensource.org/licenses/MIT) | ||
|
||
## Team | ||
The work is a collaborative effort of the following team members | ||
who each contributed to the project: | ||
|
||
* [Anton Bauer](https://github.com/g-wagen) | ||
* [Barbara Hartmann](https://github.com/BabsBerlin) | ||
* [Felipe Lopes](https://github.com/felipebool) | ||
* [Andreas Tai](https://github.com/xchange11) | ||
|
||
## Thanks | ||
We can not thank enough the AMAAAAAZIIIING team of Le Wagon. | ||
The patience, expertise, and dedication opened a new world for us. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
web: uvicorn bundestag:app --host=0.0.0.0 --port=${PORT:-5000} |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import random | ||
from fastapi import FastAPI | ||
from fastapi.middleware.cors import CORSMiddleware | ||
import joblib | ||
import pandas as pd | ||
import os | ||
from bundestag.bundestrainer import Bundestrainer | ||
|
||
# The system variable "DYNO" is only present in the Heroku prod env, so its
# presence is used to detect production.
ON_PRODUCTION = 'DYNO' in os.environ

# Model artifacts are read from "api/" outside production and from the
# current working directory on production.
model_dir = "" if ON_PRODUCTION else "api/"

# Prepare DL model for prediction (done once, at import time)
bundestrainer = Bundestrainer()
bundestrainer.load_model(f'{model_dir}model2.tf')
bundestrainer.load_w2c(f'{model_dir}model2.w2v')
# NOTE(review): the Makefile target `update_party_mapping` downloads
# "model2.pm", while this loads "party_mapping.txt" -- confirm which file
# name is the intended one.
bundestrainer.load_party_mapping(f'{model_dir}party_mapping.txt')

app = FastAPI()

# NOTE(review): wildcard origins combined with allow_credentials=True is a
# very permissive CORS policy -- confirm this is intended for production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Allows all origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)
|
||
@app.get("/predict")
def predict(speech_fragment, model=1):
    """Predict the party affiliation for a speech fragment.

    Args:
        speech_fragment: Text to classify.
        model: Selects the predictor. ``1`` loads the joblib pipeline from
            ``model_dir`` and uses it, ``2`` picks a party at random and
            ``3`` uses the module-level ``bundestrainer``. The default is
            the integer ``1`` while the branches compare against strings,
            so the value is normalised with ``str()`` first; otherwise the
            default would never select a model.

    Returns:
        dict: ``{"prediction": <party>}``. The prediction is an empty
        string when ``model`` matches none of the known selectors.
    """
    selector = str(model)
    pred = ""

    # Baseline Model
    if selector == "1":
        X = pd.Series([speech_fragment])
        pipeline = joblib.load(f'{model_dir}model.joblib')

        # The input holds a single fragment, so take the first (only)
        # prediction.
        results = pipeline.predict(X)
        pred = results[0]

    # Random Generator
    elif selector == "2":
        parties = [
            'CDU',
            'CSU',
            'SPD',
            'FDP',
            'LINKE',
            'GRÜNE',
            'PARTEI',
            'ÖDP',
            'PIRATEN',
        ]
        pred = random.choice(parties)

    # Deep Learning model
    elif selector == "3":
        pred = bundestrainer.predict_party_by_string(speech_fragment)

    return {"prediction": pred}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Oops, something went wrong.