From b60b42f89a09381b8bb579d5fc5f86f126911829 Mon Sep 17 00:00:00 2001 From: "Y.J" Date: Thu, 10 Sep 2020 15:32:24 -0400 Subject: [PATCH 1/3] Simplify boilerplate for monoT5 and monoBERT --- pygaggle/rerank/pretrained.py | 40 +++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 pygaggle/rerank/pretrained.py diff --git a/pygaggle/rerank/pretrained.py b/pygaggle/rerank/pretrained.py new file mode 100644 index 00000000..ef7bc11d --- /dev/null +++ b/pygaggle/rerank/pretrained.py @@ -0,0 +1,40 @@ +import torch + +from transformers import (AutoTokenizer, + AutoModelForSequenceClassification, + T5ForConditionalGeneration) + +from .base import Reranker +from .transformer import (SequenceClassificationTransformerReranker, + T5Reranker) +from pygaggle.model import T5BatchTokenizer + + +__all__ = ['monoT5', + 'monoBERT'] + + +def monoT5(model_name: str = 'castorini/monot5-base-msmarco', + tokenizer_name: str = 't5-base', + batch_size: int = 8) -> Reranker: + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + model = T5ForConditionalGeneration.from_pretrained(model_name) + model = model.to(device).eval() + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + tokenizer = T5BatchTokenizer(tokenizer, batch_size) + return T5Reranker(model, tokenizer) + + +def monoBERT(model_name: str = 'castorini/monobert-large-msmarco', + tokenizer_name: str = 'bert-large-uncased') -> Reranker: + + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + model = AutoModelForSequenceClassification.from_pretrained(model_name) + model = model.to(device).eval() + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) + return SequenceClassificationTransformerReranker(model, tokenizer) From 26de4a69cc8f417cb2bc932433efd4eb79f9ca22 Mon Sep 17 00:00:00 2001 From: "Y.J" Date: Fri, 11 Sep 2020 12:37:30 -0400 Subject: [PATCH 2/3] Fold into constructors --- README.md | 30 ++----------- pygaggle/rerank/pretrained.py | 
40 ------------------ pygaggle/rerank/transformer.py | 47 ++++++++++++++------- pygaggle/run/evaluate_document_ranker.py | 8 ++-- pygaggle/run/evaluate_kaggle_highlighter.py | 8 ++-- pygaggle/run/evaluate_passage_ranker.py | 8 ++-- 6 files changed, 48 insertions(+), 93 deletions(-) delete mode 100644 pygaggle/rerank/pretrained.py diff --git a/README.md b/README.md index 8a5e0926..d404b3b9 100644 --- a/README.md +++ b/README.md @@ -35,45 +35,23 @@ Currently, this repo contains implementations of the rerankers for [CovidQA](htt Here's how to initalize the T5 reranker from [Document Ranking with a Pretrained Sequence-to-Sequence Model](https://arxiv.org/pdf/2003.06713.pdf): ```python -import torch -from transformers import AutoTokenizer, T5ForConditionalGeneration -from pygaggle.model import T5BatchTokenizer from pygaggle.rerank.base import Query, Text -from pygaggle.rerank.transformer import T5Reranker +from pygaggle.rerank.transformer import monoT5 model_name = 'castorini/monot5-base-msmarco' tokenizer_name = 't5-base' -batch_size = 8 - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -model = T5ForConditionalGeneration.from_pretrained(model_name) -model = model.to(device).eval() - -tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) -tokenizer = T5BatchTokenizer(tokenizer, batch_size) -reranker = T5Reranker(model, tokenizer) +reranker = monoT5(model_name, tokenizer_name) ``` Alternatively, here's the BERT reranker from [Passage Re-ranking with BERT](https://arxiv.org/pdf/1901.04085.pdf), which isn't as good as the T5 reranker: ```python -import torch -from transformers import AutoTokenizer, AutoModelForSequenceClassification -from pygaggle.model import BatchTokenizer from pygaggle.rerank.base import Query, Text -from pygaggle.rerank.transformer import SequenceClassificationTransformerReranker +from pygaggle.rerank.transformer import monoBERT model_name = 'castorini/monobert-large-msmarco' tokenizer_name = 'bert-large-uncased' - -device 
= torch.device('cuda' if torch.cuda.is_available() else 'cpu') - -model = AutoModelForSequenceClassification.from_pretrained(model_name) -model = model.to(device).eval() - -tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) -reranker = SequenceClassificationTransformerReranker(model, tokenizer) +reranker = monoBERT(model_name, tokenizer_name) ``` Either way, continue with a complere reranking example: diff --git a/pygaggle/rerank/pretrained.py b/pygaggle/rerank/pretrained.py deleted file mode 100644 index ef7bc11d..00000000 --- a/pygaggle/rerank/pretrained.py +++ /dev/null @@ -1,40 +0,0 @@ -import torch - -from transformers import (AutoTokenizer, - AutoModelForSequenceClassification, - T5ForConditionalGeneration) - -from .base import Reranker -from .transformer import (SequenceClassificationTransformerReranker, - T5Reranker) -from pygaggle.model import T5BatchTokenizer - - -__all__ = ['monoT5', - 'monoBERT'] - - -def monoT5(model_name: str = 'castorini/monot5-base-msmarco', - tokenizer_name: str = 't5-base', - batch_size: int = 8) -> Reranker: - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - model = T5ForConditionalGeneration.from_pretrained(model_name) - model = model.to(device).eval() - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - tokenizer = T5BatchTokenizer(tokenizer, batch_size) - return T5Reranker(model, tokenizer) - - -def monoBERT(model_name: str = 'castorini/monobert-large-msmarco', - tokenizer_name: str = 'bert-large-uncased') -> Reranker: - - device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') - - model = AutoModelForSequenceClassification.from_pretrained(model_name) - model = model.to(device).eval() - - tokenizer = AutoTokenizer.from_pretrained(tokenizer_name) - return SequenceClassificationTransformerReranker(model, tokenizer) diff --git a/pygaggle/rerank/transformer.py b/pygaggle/rerank/transformer.py index fec586bc..4b9ae1a0 100644 --- a/pygaggle/rerank/transformer.py +++ 
b/pygaggle/rerank/transformer.py @@ -1,7 +1,9 @@ from copy import deepcopy -from typing import List +from typing import List, Union -from transformers import (PreTrainedModel, +from transformers import (AutoTokenizer, + AutoModelForSequenceClassification, + PreTrainedModel, PreTrainedTokenizer, T5ForConditionalGeneration) import torch @@ -13,21 +15,29 @@ QueryDocumentBatch, QueryDocumentBatchTokenizer, SpecialTokensCleaner, + T5BatchTokenizer, greedy_decode) -__all__ = ['T5Reranker', +__all__ = ['monoT5', 'UnsupervisedTransformerReranker', - 'SequenceClassificationTransformerReranker', + 'monoBERT', 'QuestionAnsweringTransformerReranker'] -class T5Reranker(Reranker): +class monoT5(Reranker): def __init__(self, - model: T5ForConditionalGeneration, - tokenizer: QueryDocumentBatchTokenizer): - self.model = model - self.tokenizer = tokenizer + model_name_or_instance: Union[str, T5ForConditionalGeneration] = 'castorini/monot5-base-msmarco', + tokenizer_name_or_instance: Union[str, QueryDocumentBatchTokenizer] = 't5-base'): + if isinstance(model_name_or_instance, str): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model_name_or_instance = T5ForConditionalGeneration.from_pretrained(model_name_or_instance).to(device).eval() + self.model = model_name_or_instance + + if isinstance(tokenizer_name_or_instance, str): + tokenizer_name_or_instance = T5BatchTokenizer(AutoTokenizer.from_pretrained(tokenizer_name_or_instance), batch_size=8) + self.tokenizer = tokenizer_name_or_instance + self.device = next(self.model.parameters(), None).device def rerank(self, query: Query, texts: List[Text]) -> List[Text]: @@ -97,13 +107,20 @@ def rerank(self, query: Query, texts: List[Text]) -> List[Text]: return texts -class SequenceClassificationTransformerReranker(Reranker): +class monoBERT(Reranker): def __init__(self, - model: PreTrainedModel, - tokenizer: PreTrainedTokenizer): - self.tokenizer = tokenizer - self.model = model - self.device = 
next(model.parameters()).device + model_name_or_instance: Union[str, PreTrainedModel] = 'castorini/monobert-large-msmarco', + tokenizer_name_or_instance: Union[str, PreTrainedTokenizer] = 'bert-large-uncased'): + if isinstance(model_name_or_instance, str): + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + model_name_or_instance = AutoModelForSequenceClassification.from_pretrained(model_name_or_instance).to(device).eval() + self.model = model_name_or_instance + + if isinstance(tokenizer_name_or_instance, str): + tokenizer_name_or_instance = AutoTokenizer.from_pretrained(tokenizer_name_or_instance) + self.tokenizer = tokenizer_name_or_instance + + self.device = next(self.model.parameters(), None).device @torch.no_grad() def rerank(self, query: Query, texts: List[Text]) -> List[Text]: diff --git a/pygaggle/run/evaluate_document_ranker.py b/pygaggle/run/evaluate_document_ranker.py index aa934a43..7e7b4006 100644 --- a/pygaggle/run/evaluate_document_ranker.py +++ b/pygaggle/run/evaluate_document_ranker.py @@ -14,8 +14,8 @@ from pygaggle.rerank.bm25 import Bm25Reranker from pygaggle.rerank.transformer import ( UnsupervisedTransformerReranker, - T5Reranker, - SequenceClassificationTransformerReranker + monoT5, + monoBERT ) from pygaggle.rerank.random import RandomReranker from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider @@ -85,7 +85,7 @@ def construct_t5(options: DocumentRankingEvaluationOptions) -> Reranker: from_tf=options.from_tf).to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.model_type) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) - return T5Reranker(model, tokenizer) + return monoT5(model, tokenizer) def construct_transformer(options: @@ -106,7 +106,7 @@ def construct_seq_class_transformer(options: DocumentRankingEvaluationOptions device = torch.device(options.device) model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name) - return 
SequenceClassificationTransformerReranker(model, tokenizer) + return monoBERT(model, tokenizer) def construct_bm25(options: DocumentRankingEvaluationOptions) -> Reranker: diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index b9b8b4dc..43cbb421 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -16,8 +16,8 @@ from pygaggle.rerank.bm25 import Bm25Reranker from pygaggle.rerank.transformer import ( QuestionAnsweringTransformerReranker, - SequenceClassificationTransformerReranker, - T5Reranker, + monoBERT, + monoT5, UnsupervisedTransformerReranker ) from pygaggle.rerank.random import RandomReranker @@ -82,7 +82,7 @@ def construct_t5(options: KaggleEvaluationOptions) -> Reranker: tokenizer = AutoTokenizer.from_pretrained( options.model_name, do_lower_case=options.do_lower_case) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) - return T5Reranker(model, tokenizer) + return monoT5(model, tokenizer) def construct_transformer(options: KaggleEvaluationOptions) -> Reranker: @@ -124,7 +124,7 @@ def construct_seq_class_transformer(options: model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained( options.tokenizer_name, do_lower_case=options.do_lower_case) - return SequenceClassificationTransformerReranker(model, tokenizer) + return monoBERT(model, tokenizer) def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index d40010f4..c5098fb9 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -15,8 +15,8 @@ from pygaggle.rerank.bm25 import Bm25Reranker from pygaggle.rerank.transformer import ( UnsupervisedTransformerReranker, - T5Reranker, - SequenceClassificationTransformerReranker + monoT5, + monoBERT ) from pygaggle.rerank.random import RandomReranker from 
pygaggle.rerank.similarity import CosineSimilarityMatrixProvider @@ -83,7 +83,7 @@ def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker: from_tf=options.from_tf).to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.model_type) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) - return T5Reranker(model, tokenizer) + return monoT5(model, tokenizer) def construct_transformer(options: @@ -116,7 +116,7 @@ def construct_seq_class_transformer(options: PassageRankingEvaluationOptions device = torch.device(options.device) model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name) - return SequenceClassificationTransformerReranker(model, tokenizer) + return monoBERT(model, tokenizer) def construct_bm25(options: PassageRankingEvaluationOptions) -> Reranker: From 5a6a0fb33078ee953b8964ff544275469118f340 Mon Sep 17 00:00:00 2001 From: "Y.J" Date: Sat, 12 Sep 2020 23:51:20 -0400 Subject: [PATCH 3/3] Capitalize class names --- README.md | 8 ++++---- pygaggle/rerank/transformer.py | 12 ++++++------ pygaggle/run/evaluate_document_ranker.py | 8 ++++---- pygaggle/run/evaluate_kaggle_highlighter.py | 8 ++++---- pygaggle/run/evaluate_passage_ranker.py | 8 ++++---- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index d404b3b9..4ec0a780 100644 --- a/README.md +++ b/README.md @@ -36,22 +36,22 @@ Here's how to initalize the T5 reranker from [Document Ranking with a Pretrained ```python from pygaggle.rerank.base import Query, Text -from pygaggle.rerank.transformer import monoT5 +from pygaggle.rerank.transformer import MonoT5 model_name = 'castorini/monot5-base-msmarco' tokenizer_name = 't5-base' -reranker = monoT5(model_name, tokenizer_name) +reranker = MonoT5(model_name, tokenizer_name) ``` Alternatively, here's the BERT reranker from [Passage Re-ranking with BERT](https://arxiv.org/pdf/1901.04085.pdf), which isn't as good as the T5 reranker: ```python from 
pygaggle.rerank.base import Query, Text -from pygaggle.rerank.transformer import monoBERT +from pygaggle.rerank.transformer import MonoBERT model_name = 'castorini/monobert-large-msmarco' tokenizer_name = 'bert-large-uncased' -reranker = monoBERT(model_name, tokenizer_name) +reranker = MonoBERT(model_name, tokenizer_name) ``` Either way, continue with a complere reranking example: diff --git a/pygaggle/rerank/transformer.py b/pygaggle/rerank/transformer.py index 4b9ae1a0..c69960fd 100644 --- a/pygaggle/rerank/transformer.py +++ b/pygaggle/rerank/transformer.py @@ -19,15 +19,15 @@ greedy_decode) -__all__ = ['monoT5', +__all__ = ['MonoT5', 'UnsupervisedTransformerReranker', - 'monoBERT', + 'MonoBERT', 'QuestionAnsweringTransformerReranker'] -class monoT5(Reranker): +class MonoT5(Reranker): def __init__(self, - model_name_or_instance: Union[str, T5ForConditionalGeneration] = 'castorini/monot5-base-msmarco', + model_name_or_instance: Union[str, T5ForConditionalGeneration] = 'castorini/monot5-base-msmarco', tokenizer_name_or_instance: Union[str, QueryDocumentBatchTokenizer] = 't5-base'): if isinstance(model_name_or_instance, str): device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') @@ -107,9 +107,9 @@ def rerank(self, query: Query, texts: List[Text]) -> List[Text]: return texts -class monoBERT(Reranker): +class MonoBERT(Reranker): def __init__(self, - model_name_or_instance: Union[str, PreTrainedModel] = 'castorini/monobert-large-msmarco', + model_name_or_instance: Union[str, PreTrainedModel] = 'castorini/monobert-large-msmarco', tokenizer_name_or_instance: Union[str, PreTrainedTokenizer] = 'bert-large-uncased'): if isinstance(model_name_or_instance, str): device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') diff --git a/pygaggle/run/evaluate_document_ranker.py b/pygaggle/run/evaluate_document_ranker.py index 7e7b4006..c6a27a34 100644 --- a/pygaggle/run/evaluate_document_ranker.py +++ b/pygaggle/run/evaluate_document_ranker.py @@ -14,8 
+14,8 @@ from pygaggle.rerank.bm25 import Bm25Reranker from pygaggle.rerank.transformer import ( UnsupervisedTransformerReranker, - monoT5, - monoBERT + MonoT5, + MonoBERT ) from pygaggle.rerank.random import RandomReranker from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider @@ -85,7 +85,7 @@ def construct_t5(options: DocumentRankingEvaluationOptions) -> Reranker: from_tf=options.from_tf).to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.model_type) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) - return monoT5(model, tokenizer) + return MonoT5(model, tokenizer) def construct_transformer(options: @@ -106,7 +106,7 @@ def construct_seq_class_transformer(options: DocumentRankingEvaluationOptions device = torch.device(options.device) model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name) - return monoBERT(model, tokenizer) + return MonoBERT(model, tokenizer) def construct_bm25(options: DocumentRankingEvaluationOptions) -> Reranker: diff --git a/pygaggle/run/evaluate_kaggle_highlighter.py b/pygaggle/run/evaluate_kaggle_highlighter.py index 43cbb421..c9ba32ca 100644 --- a/pygaggle/run/evaluate_kaggle_highlighter.py +++ b/pygaggle/run/evaluate_kaggle_highlighter.py @@ -16,8 +16,8 @@ from pygaggle.rerank.bm25 import Bm25Reranker from pygaggle.rerank.transformer import ( QuestionAnsweringTransformerReranker, - monoBERT, - monoT5, + MonoBERT, + MonoT5, UnsupervisedTransformerReranker ) from pygaggle.rerank.random import RandomReranker @@ -82,7 +82,7 @@ def construct_t5(options: KaggleEvaluationOptions) -> Reranker: tokenizer = AutoTokenizer.from_pretrained( options.model_name, do_lower_case=options.do_lower_case) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) - return monoT5(model, tokenizer) + return MonoT5(model, tokenizer) def construct_transformer(options: KaggleEvaluationOptions) -> Reranker: @@ -124,7 +124,7 @@ def construct_seq_class_transformer(options: model = 
model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained( options.tokenizer_name, do_lower_case=options.do_lower_case) - return monoBERT(model, tokenizer) + return MonoBERT(model, tokenizer) def construct_qa_transformer(options: KaggleEvaluationOptions) -> Reranker: diff --git a/pygaggle/run/evaluate_passage_ranker.py b/pygaggle/run/evaluate_passage_ranker.py index c5098fb9..3121b338 100644 --- a/pygaggle/run/evaluate_passage_ranker.py +++ b/pygaggle/run/evaluate_passage_ranker.py @@ -15,8 +15,8 @@ from pygaggle.rerank.bm25 import Bm25Reranker from pygaggle.rerank.transformer import ( UnsupervisedTransformerReranker, - monoT5, - monoBERT + MonoT5, + MonoBERT ) from pygaggle.rerank.random import RandomReranker from pygaggle.rerank.similarity import CosineSimilarityMatrixProvider @@ -83,7 +83,7 @@ def construct_t5(options: PassageRankingEvaluationOptions) -> Reranker: from_tf=options.from_tf).to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.model_type) tokenizer = T5BatchTokenizer(tokenizer, options.batch_size) - return monoT5(model, tokenizer) + return MonoT5(model, tokenizer) def construct_transformer(options: @@ -116,7 +116,7 @@ def construct_seq_class_transformer(options: PassageRankingEvaluationOptions device = torch.device(options.device) model = model.to(device).eval() tokenizer = AutoTokenizer.from_pretrained(options.tokenizer_name) - return monoBERT(model, tokenizer) + return MonoBERT(model, tokenizer) def construct_bm25(options: PassageRankingEvaluationOptions) -> Reranker: