
Commit

Pinning google-cloud-storage to 1.31.0 to resolve #1.
Robin van Schaik committed Oct 6, 2020
1 parent 9e148e8 commit c5c59f9
Showing 4 changed files with 83 additions and 27 deletions.
5 changes: 3 additions & 2 deletions examples/training-gpu/Dockerfile
@@ -10,8 +10,9 @@ RUN apt-get update && \
    && rm -rf /var/lib/apt/lists/*

# In production you want to pin dependencies.
# Installs flair and sentence-transformers.
RUN pip install --no-cache-dir -U pip install git+https://github.com/flairNLP/flair.git sentence_transformers google-cloud-storage
# Installs flair and Google Cloud Storage.
RUN pip install --no-cache-dir -U git+https://github.com/flairNLP/flair.git \
    google-cloud-storage==1.31.0
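
# A fully reproducible variant would pin the flair ref as well, e.g. to a
# tagged release (hypothetical tag shown; pip supports the git+URL@ref syntax):
# RUN pip install --no-cache-dir git+https://github.com/flairNLP/flair.git@v0.6.1 \
#     google-cloud-storage==1.31.0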

# Copies the trainer code
# Creates data folder
23 changes: 17 additions & 6 deletions examples/training-gpu/trainer/text-classification-training.py
@@ -3,6 +3,7 @@
from os import listdir
from os.path import isfile, join
import importlib

# Import modules from flair.
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
@@ -13,6 +14,7 @@
from flair.embeddings import TransformerDocumentEmbeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings


# For interfacing with Google Cloud Storage.
from google.cloud import storage

@@ -22,7 +24,6 @@

# Sampler helper function.
def sampler_helper(sampler):

    """Parse the sampler arg and return the matching sampler class."""
    if sampler is None or sampler == "None":
        return None
@@ -53,6 +54,8 @@ def get_args():
               ChunkSampler,
               ImbalancedClassificationDatasetSampler,
               or ExpandingChunkSampler).
    --use_amp: Indicates whether Automatic Mixed Precision should be used.
               1 for True, 0 for False.
    --gcs_data_path: The Google Cloud Storage (gcs) folder path containing the data.
    --gcs_output_path: The Google Cloud Storage (gcs) folder path for storing the outputs.
    Output:
@@ -111,11 +114,19 @@ def get_args():
    parser.add_argument(
        '--sampler',
        type=str,
        default=None,
        default="None",
        metavar='N',
        help='Indicates which sampler should be used (None, ChunkSampler, '
             'ImbalancedClassificationDatasetSampler, or ExpandingChunkSampler).')

    parser.add_argument(
        '--use_amp',
        type=int,
        default=1,
        metavar='N',
        help='Indicates whether Automatic Mixed Precision should be used. '
             '1 for True, 0 for False.')
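
    # The flag is an int rather than a bool, likely because argparse's
    # type=bool treats any non-empty string (including "0" and "False") as
    # True; initialize_training casts it back with bool(use_amp).
    # Hypothetical invocation exercising both new flags:
    #   python trainer/text-classification-training.py \
    #       --label_column_index 0 --text_column_index 1 \
    #       --use_amp 1 --sampler ImbalancedClassificationDatasetSampler \
    #       --gcs_bucket_name my-bucket --gcs_data_path data/ --gcs_output_path output/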

    parser.add_argument(
        '--gcs_bucket_name',
        type=str,
@@ -168,7 +179,7 @@ def gcs_data_to_docker(gcs_bucket_name, gcs_data_path):

def initialize_training(text_column_index, label_column_index, delimiter=';',
                        model_type="TransformerDocumentEmbeddings", model='sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
                        max_epochs=10, patience=3, sampler=None, use_amp=True):
                        max_epochs=10, patience=3, sampler=None, use_amp=0):
"""
Create a text classification model using FLAIR and SentenceTransformers/Huggingface Transformers.
------------------------
@@ -192,8 +203,6 @@ def initialize_training(text_column_index, label_column_index, delimiter=';',
        training.log
    """



    # 1. Column format indicating which columns hold the text and label(s)
    column_name_map = {text_column_index: "text",
                       label_column_index: "label_topic"}
@@ -248,6 +257,7 @@ def initialize_training(text_column_index, label_column_index, delimiter=';',
                  learning_rate=3e-5,  # use very small learning rate
                  max_epochs=max_epochs,
                  patience=patience,
                  use_amp=bool(use_amp),
                  checkpoint=True,
                  sampler=sampler)
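
    # Note: in flair releases of this era, use_amp=True appears to rely on
    # NVIDIA's apex package being present in the image (assumption based on
    # flair 0.6.x); if apex is not installed, keep --use_amp at 0.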

@@ -287,7 +297,8 @@ def main():
                        model=args.model,
                        max_epochs=args.epochs,
                        patience=args.patience,
                        sampler=sampler_helper(args.sampler))
                        sampler=sampler_helper(args.sampler),
                        use_amp=args.use_amp)

    # Copy the training output from the Docker to GCS.
    training_output_to_gcs(gcs_output_path=args.gcs_output_path,
4 changes: 3 additions & 1 deletion examples/training/Dockerfile
@@ -5,7 +5,9 @@ WORKDIR /root

# In production you want to pin dependencies.
# Installs flair and sentence-transformers.
RUN pip install -U pip install -U flair sentence_transformers google-cloud-storage
RUN pip install flair \
    sentence_transformers \
    google-cloud-storage==1.31.0
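
# To pin everything, not just the storage client, the other two could be
# frozen as well (versions shown are hypothetical placeholders):
# RUN pip install --no-cache-dir flair==0.6.1 \
#     sentence_transformers==0.3.6 \
#     google-cloud-storage==1.31.0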

# Copies the trainer code
# Creates data folder
78 changes: 60 additions & 18 deletions examples/training/trainer/text-classification-training.py
@@ -2,6 +2,8 @@
import argparse
from os import listdir
from os.path import isfile, join
import importlib

# Import modules from flair.
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
@@ -12,13 +14,25 @@
from flair.embeddings import TransformerDocumentEmbeddings
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings


# For interfacing with Google Cloud Storage.
from google.cloud import storage

# Only log essentials.
import logging
logging.basicConfig(level=logging.ERROR)

# Sampler helper function.
def sampler_helper(sampler):
    """Parse the sampler arg and return the matching sampler class."""
    if sampler is None or sampler == "None":
        return None
    else:
        sampler_module = importlib.import_module('flair.samplers')
        sampler_class = getattr(sampler_module, sampler)
        return sampler_class
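
# Example: sampler_helper("ImbalancedClassificationDatasetSampler") returns the
# class object itself rather than an instance; flair's ModelTrainer.train()
# accepts the class and instantiates it internally (assumption based on the
# flair 0.6.x line).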


def get_args():
    """Parses the arguments as input for the training function.
    Params:
@@ -36,13 +50,18 @@ def get_args():
                 Defaults to 10.
    --patience: Indicates the number of epochs without improvement before aborting.
                Defaults to 3.
    --sampler: Indicates which sampler should be used (None,
               ChunkSampler,
               ImbalancedClassificationDatasetSampler,
               or ExpandingChunkSampler).
    --gcs_data_path: The Google Cloud Storage (gcs) folder path containing the data.
    --gcs_output_path: The Google Cloud Storage (gcs) folder path for storing the outputs.
    Output:
        Dictionary of arguments.
    """

    parser = argparse.ArgumentParser(description='Text Classification with Flair On GCP via Docker Container')
    parser = argparse.ArgumentParser(
        description='Text Classification with Flair On GCP via Docker Container.')

    parser.add_argument(
        '--label_column_index',
@@ -90,6 +109,14 @@ def get_args():
        metavar='N',
        help='Indicates the number of epochs without improvement before aborting.')

    parser.add_argument(
        '--sampler',
        type=str,
        default="None",
        metavar='N',
        help='Indicates which sampler should be used (None, ChunkSampler, '
             'ImbalancedClassificationDatasetSampler, or ExpandingChunkSampler).')

    parser.add_argument(
        '--gcs_bucket_name',
        type=str,
@@ -142,7 +169,7 @@ def gcs_data_to_docker(gcs_bucket_name, gcs_data_path):

def initialize_training(text_column_index, label_column_index, delimiter=';',
                        model_type="TransformerDocumentEmbeddings", model='sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
                        max_epochs=10, patience=3):
                        max_epochs=10, patience=3, sampler=None):
"""
Create a text classification model using FLAIR and SentenceTransformers/Huggingface Transformers.
------------------------
@@ -156,6 +183,9 @@ def initialize_training(text_column_index, label_column_index, delimiter=';',
    model_type: SentenceTransformerDocumentEmbeddings or TransformerDocumentEmbeddings
    model: Which model to use. Defaults to a multilingual model.
    max_epochs: Number of epochs to train the model for.
    patience: Number of epochs without improvement before terminating training.
    sampler: Optional sampler class from flair.samplers. Defaults to None.
    ------------------------
    Output:
        best-model.pt
@@ -164,7 +194,8 @@ def initialize_training(text_column_index, label_column_index, delimiter=';',
"""

# 1. Column format indicating which columns hold the text and label(s)
column_name_map = {text_column_index: "text", label_column_index: "label_topic"}
column_name_map = {text_column_index: "text",
label_column_index: "label_topic"}

# 2. Load corpus containing training, test and dev data and if CSV has a header, you can skip it
corpus: Corpus = CSVClassificationCorpus("./trainer/data/",
@@ -181,24 +212,29 @@

    # 4. Initialize the sentence_transformers model.
    if model_type == "SentenceTransformerDocumentEmbeddings":
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
    elif model_type == "TransformerDocumentEmbeddings":
        document_embeddings = TransformerDocumentEmbeddings(model, fine_tune=True)
        document_embeddings = TransformerDocumentEmbeddings(
            model, fine_tune=True)
    elif model_type == "WordEmbeddings":
        word_embeddings = [WordEmbeddings(model)]
        document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
        document_embeddings = DocumentRNNEmbeddings(
            word_embeddings, hidden_size=256)
    elif model_type == "StackedEmbeddings":
        document_embeddings = DocumentRNNEmbeddings([
            #WordEmbeddings('nl'),
            SentenceTransformerDocumentEmbeddings('distiluse-base-multilingual-cased'),
            FlairEmbeddings(model + '-backward-fast'),
            FlairEmbeddings(model + '-forward-fast')
        ])
            # WordEmbeddings('nl'),
            SentenceTransformerDocumentEmbeddings(
                'distiluse-base-multilingual-cased'),
            FlairEmbeddings(model + '-backward'),
            FlairEmbeddings(model + '-forward')
        ])
    else:
        raise Exception("Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings.")
        raise Exception(
            "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings.")

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
    classifier = TextClassifier(
        document_embeddings, label_dictionary=label_dict)

    # 6. initialize the text classifier trainer with Adam optimizer
    trainer = ModelTrainer(classifier,
@@ -211,7 +247,8 @@ def initialize_training(text_column_index, label_column_index, delimiter=';',
                  learning_rate=3e-5,  # use very small learning rate
                  max_epochs=max_epochs,
                  patience=patience,
                  checkpoint=True)
                  checkpoint=True,
                  sampler=sampler)


def training_output_to_gcs(gcs_bucket_name, gcs_output_path):
@@ -224,7 +261,8 @@ def training_output_to_gcs(gcs_bucket_name, gcs_output_path):

    storage_client = storage.Client()
    bucket = storage_client.get_bucket(gcs_bucket_name)
    files = [f for f in listdir("./trainer/checkpoint/") if isfile(join("./trainer/checkpoint/", f))]
    files = [f for f in listdir("./trainer/checkpoint/")
             if isfile(join("./trainer/checkpoint/", f))]
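    # Only files directly under ./trainer/checkpoint/ are uploaded; the
    # isfile() filter above skips any subdirectories.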
    for file in files:
        localFile = "./trainer/checkpoint/" + file
        blob = bucket.blob(gcs_output_path + file)
@@ -237,7 +275,8 @@ def main():
    args = get_args()

    # Copy data from GCS to the Docker.
    gcs_data_to_docker(gcs_data_path=args.gcs_data_path, gcs_bucket_name=args.gcs_bucket_name)
    gcs_data_to_docker(gcs_data_path=args.gcs_data_path,
                       gcs_bucket_name=args.gcs_bucket_name)

    # Once the data is copied to the Docker, initialize training.
    initialize_training(text_column_index=args.text_column_index,
@@ -246,10 +285,13 @@ def main():
                        model_type=args.model_type,
                        model=args.model,
                        max_epochs=args.epochs,
                        patience=args.patience)
                        patience=args.patience,
                        sampler=sampler_helper(args.sampler))

    # Copy the training output from the Docker to GCS.
    training_output_to_gcs(gcs_output_path=args.gcs_output_path, gcs_bucket_name=args.gcs_bucket_name)
    training_output_to_gcs(gcs_output_path=args.gcs_output_path,
                           gcs_bucket_name=args.gcs_bucket_name)


if __name__ == '__main__':
    main()
