[Hackathon + GradientCache] #1799
@@ -0,0 +1,228 @@

```python
import collections
import json
import random
from typing import List

import numpy as np
import paddle
from paddle.io import Dataset
from paddlenlp.transformers.bert.tokenizer import BertTokenizer

BiEncoderPassage = collections.namedtuple("BiEncoderPassage", ["text", "title"])

BiEncoderBatch = collections.namedtuple(
    "BiEncoderInput",
    [
        "questions_ids",
        "question_segments",
        "context_ids",
        "ctx_segments",
        "is_positive",
        "hard_negatives",
        "encoder_type",
    ],
)


def normalize_question(question: str) -> str:
    return question.replace("’", "'")


def normalize_passage(ctx_text: str):
    ctx_text = ctx_text.replace("\n", " ").replace("’", "'")
    if ctx_text.startswith('"'):
        ctx_text = ctx_text[1:]
    if ctx_text.endswith('"'):
        ctx_text = ctx_text[:-1]
    return ctx_text


class BiEncoderSample(object):
    query: str
    positive_passages: List[BiEncoderPassage]
    negative_passages: List[BiEncoderPassage]
    hard_negative_passages: List[BiEncoderPassage]


class NQdataSetForDPR(Dataset):
    """Dataset of Natural Questions samples in the DPR JSON format."""

    def __init__(self, dataPath, query_special_suffix=None):
        super(NQdataSetForDPR, self).__init__()
        self.data = self._read_json_data(dataPath)
        self.query_special_suffix = query_special_suffix
        # Materialize all samples once up front (the original called the
        # nonexistent self.lens()).
        self.new_data = [self.__getitem__(i) for i in range(len(self))]

    def _read_json_data(self, dataPath):
        results = []
        with open(dataPath, "r", encoding="utf-8") as f:
            print("Reading file %s" % dataPath)
            data = json.load(f)
            results.extend(data)
            print("Aggregated data size: {}".format(len(results)))
        return results

    def __getitem__(self, index):
        json_sample_data = self.data[index]
        r = BiEncoderSample()
        r.query = self._process_query(json_sample_data["question"])

        positive_ctxs = json_sample_data["positive_ctxs"]
        negative_ctxs = json_sample_data.get("negative_ctxs", [])
        hard_negative_ctxs = json_sample_data.get("hard_negative_ctxs", [])

        for ctx in positive_ctxs + negative_ctxs + hard_negative_ctxs:
            if "title" not in ctx:
                ctx["title"] = None

        def create_passage(ctx):
            return BiEncoderPassage(normalize_passage(ctx["text"]), ctx["title"])

        r.positive_passages = [create_passage(ctx) for ctx in positive_ctxs]
        r.negative_passages = [create_passage(ctx) for ctx in negative_ctxs]
        r.hard_negative_passages = [create_passage(ctx) for ctx in hard_negative_ctxs]

        return r

    def _process_query(self, query):
        query = normalize_question(query)
        if self.query_special_suffix and not query.endswith(self.query_special_suffix):
            query += self.query_special_suffix
        return query

    def __len__(self):
        return len(self.data)


class DataUtil(object):
    def __init__(self, max_length=256):
        # The original instantiated BertTensorizer() without the required
        # max_length; 256 is an assumed default matching the DPR passage length.
        self.tensorizer = BertTensorizer(max_length)

    def create_biencoder_input(self,
                               samples: List[BiEncoderSample],
                               inserted_title,
                               num_hard_negatives=0,
                               num_other_negatives=0,
                               shuffle=True,
                               shuffle_positives=False,
                               hard_neg_positives=False,  # unused
                               hard_neg_fallback=True,
                               query_token=None):
        question_tensors = []
        ctx_tensors = []
        positive_ctx_indices = []
        hard_neg_ctx_indices = []

        for sample in samples:
            if shuffle and shuffle_positives:
                positive_ctxs = sample.positive_passages
                positive_ctx = positive_ctxs[np.random.choice(len(positive_ctxs))]
            else:
                positive_ctx = sample.positive_passages[0]

            neg_ctxs = sample.negative_passages
            hard_neg_ctxs = sample.hard_negative_passages
            question = sample.query

            if shuffle:
                random.shuffle(neg_ctxs)
                random.shuffle(hard_neg_ctxs)

            # Fall back to ordinary negatives when a sample has no hard negatives.
            if hard_neg_fallback and len(hard_neg_ctxs) == 0:
                hard_neg_ctxs = neg_ctxs[0:num_hard_negatives]

            neg_ctxs = neg_ctxs[0:num_other_negatives]
            hard_neg_ctxs = hard_neg_ctxs[0:num_hard_negatives]

            all_ctxs = [positive_ctx] + neg_ctxs + hard_neg_ctxs
            hard_negative_start_idx = 1
            hard_negative_end_idx = 1 + len(hard_neg_ctxs)

            current_ctxs_len = len(ctx_tensors)

            sample_ctxs_tensors = [
                self.tensorizer.text_to_tensor(
                    ctx.text,
                    title=ctx.title if (inserted_title and ctx.title) else None)
                for ctx in all_ctxs
            ]

            ctx_tensors.extend(sample_ctxs_tensors)
            positive_ctx_indices.append(current_ctxs_len)
            # Record this sample's hard-negative positions as a list (the
            # original appended a bare generator expression).
            hard_neg_ctx_indices.append(
                list(range(
                    current_ctxs_len + hard_negative_start_idx,
                    current_ctxs_len + hard_negative_end_idx,
                )))

            # NOTE: the query_token / "[START_END]" span-selection branch of the
            # original DPR code is not implemented here.
            question_tensors.append(self.tensorizer.text_to_tensor(question))

        ctxs_tensor = paddle.concat([paddle.reshape(ctx, [1, -1]) for ctx in ctx_tensors], axis=0)
        questions_tensor = paddle.concat([paddle.reshape(q, [1, -1]) for q in question_tensors], axis=0)

        ctx_segments = paddle.zeros_like(ctxs_tensor)
        question_segments = paddle.zeros_like(questions_tensor)

        return BiEncoderBatch(
            questions_tensor,
            question_segments,
            ctxs_tensor,
            ctx_segments,
            positive_ctx_indices,
            hard_neg_ctx_indices,
            "question",
        )


class BertTensorizer(object):
    def __init__(self, max_length: int, pad_to_max=True):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.max_length = max_length
        self.pad_to_max = pad_to_max

    def text_to_tensor(self,
                       text: str,
                       title=None,
                       add_special_tokens=True,
                       apply_max_len=True):
        text = text.strip()

        # truncation_strategy expects a strategy string, not the boolean the
        # original passed.
        if title:
            encoded = self.tokenizer.encode(
                title,
                text_pair=text,
                add_special_tokens=add_special_tokens,
                max_seq_len=self.max_length if apply_max_len else 10000,
                pad_to_max_seq_len=False,
                truncation_strategy="longest_first",
            )
        else:
            encoded = self.tokenizer.encode(
                text,
                add_special_tokens=add_special_tokens,
                max_seq_len=self.max_length if apply_max_len else 10000,
                pad_to_max_seq_len=False,
                truncation_strategy="longest_first",
            )
        # PaddleNLP's encode returns a dict; keep only the token ids.
        token_ids = encoded["input_ids"]

        seq_len = self.max_length
        if self.pad_to_max and len(token_ids) < seq_len:
            # Pad with pad_token_id (the original used pad_token_type_id, which
            # only works because both happen to be 0 for BERT).
            token_ids = token_ids + [self.tokenizer.pad_token_id] * (seq_len - len(token_ids))
        if len(token_ids) >= seq_len:
            token_ids = token_ids[0:seq_len] if apply_max_len else token_ids
            token_ids[-1] = self.tokenizer.sep_token_id  # keep a closing [SEP]

        return paddle.to_tensor(token_ids)
```
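For reviewers who want to smoke-test the loaders above, here is a minimal, hypothetical usage sketch (not part of the PR); the data path, the `DataUtil()` default length, and the batch slicing are all assumptions.

```python
# Hypothetical smoke test for the classes above (not part of the PR).
# Assumes an NQ-format JSON file at data/nq-train.json.
dataset = NQdataSetForDPR("data/nq-train.json")
util = DataUtil()  # uses the assumed max_length=256 default

# One bi-encoder batch from the first 4 samples, 1 hard negative per sample.
batch = util.create_biencoder_input(
    dataset.new_data[:4],
    inserted_title=True,
    num_hard_negatives=1,
)
print(batch.questions_ids.shape)  # e.g. [4, 256]
print(batch.context_ids.shape)    # e.g. [8, 256]: 4 positives + 4 hard negatives
```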
```diff
@@ -78,7 +78,7 @@
 Taking the semantic-similarity training data we provide as an example, the following command starts model training with the In-batch negatives strategy on GPU cards 0,1,2,3:

-python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
+python -u -m paddle.distributed.launch --gpus "0" \
     train_batch_neg.py \
     --device gpu \
     --save_dir ./checkpoints/ \
```
```diff
@@ -144,7 +144,7 @@ python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
     --device gpu \
     --recall_result_dir "recall_result_dir" \
     --recall_result_file "recall_result.txt" \
-    --params_path "${checkpoints_params_file}" \
+    --params_path "./temp10/model_state.pdparams" \
     --hnsw_m 100 \
     --hnsw_ef 100 \
     --batch_size 64 \
```

> Review comment: Avoid hard-coding this path.
```diff
@@ -178,7 +178,7 @@ python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
 Next, run the following command to evaluate retrieval quality, producing the R@10 and R@50 metrics:
 python -u evaluate.py \
-    --similar_pair_file "semantic_similar_pair.tsv" \
+    --similar_text_pair "semantic_similar_pair.tsv" \
     --recall_result_file "./recall_result_dir/recall_result.txt" \
     --recall_num 50
```
@@ -0,0 +1,6 @@

### "Data preparation": download the data with download_data.py from the facebook repository, then place it in the corresponding folders.
### "Model training": run train_gradient_cache_DPR.py; set its parameters following the best training strategy from the original paper's repository, and set the model save location.
### "Evaluation": first run generate_dense_embeddings.py, then run dense_retriever.py.
### The evaluation files were taken from facebook's repository, with the torch-based parts reimplemented in paddle.
## The run parameters of the files above are identical to those of the original DPR repository.
## In train, the global variables only need to be turned into command-line parameters.

> Review comment: Could you document Gradient_Cache properly, following the conventions of the README at https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/semantic_indexing ?
@@ -0,0 +1,85 @@

> Review comment (on `class BiEncoder`): Add the Paddle copyright notice to the Python files.

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class BiEncoder(nn.Layer):
    """Bi-encoder over a question encoder and a context encoder.

    `state` selects which side to build: None builds both encoders,
    "FORQUESTION" only the question side, "FORCONTEXT" only the context side.
    """

    def __init__(self, question_encoder, context_encoder, dropout, output_emb_size=768, state=None):
        super(BiEncoder, self).__init__()
        self.state = state
        if self.state is None:
            self.question_encoder = question_encoder
            self.context_encoder = context_encoder
        elif self.state == "FORQUESTION":
            self.question_encoder = question_encoder
        elif self.state == "FORCONTEXT":
            self.context_encoder = context_encoder
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
        self.emb_reduce_linear = paddle.nn.Linear(768, output_emb_size, weight_attr=weight_attr)

    def get_question_pooled_embedding(self,
                                      input_ids,
                                      token_type_ids=None,
                                      position_ids=None,
                                      attention_mask=None):
        _, cls_embedding = self.question_encoder(input_ids, token_type_ids, position_ids, attention_mask)
        # Projection / dropout / normalization are currently disabled:
        # cls_embedding = self.emb_reduce_linear(cls_embedding)
        # cls_embedding = self.dropout(cls_embedding)
        # cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
        return cls_embedding

    def get_context_pooled_embedding(self,
                                     input_ids,
                                     token_type_ids=None,
                                     position_ids=None,
                                     attention_mask=None):
        _, cls_embedding = self.context_encoder(input_ids, token_type_ids, position_ids, attention_mask)
        # Projection / dropout / normalization are currently disabled (see above).
        return cls_embedding

    def forward(self,
                question_id,
                question_segments,
                question_attn_mask,
                context_ids,
                context_segments,
                context_attn_mask):
        # Pass the attention masks by keyword; the original passed them
        # positionally, where they were consumed as position_ids.
        question_pooled_out = self.get_question_pooled_embedding(
            question_id, question_segments, attention_mask=question_attn_mask)
        context_pooled_out = self.get_context_pooled_embedding(
            context_ids, context_segments, attention_mask=context_attn_mask)
        return question_pooled_out, context_pooled_out


class BiEncoderNllLoss(object):
    def calc(self,
             q_vectors,
             ctx_vectors,
             positive_idx_per_question,
             loss_scale=None):
        # Similarity matrix: one row per question, one column per context.
        # The original transposed with perm [0, 1], a no-op; [1, 0] is intended.
        scores = paddle.matmul(q_vectors, paddle.transpose(ctx_vectors, [1, 0]))

        if len(q_vectors.shape) > 1:
            q_num = q_vectors.shape[0]
            scores = paddle.reshape(scores, [q_num, -1])

        softmax_scores = F.log_softmax(scores, axis=1)

        positive_idx = paddle.to_tensor(positive_idx_per_question)
        loss = F.nll_loss(softmax_scores, positive_idx)

        # Count how often the top-scoring context is the positive one
        # (the original left this as None).
        max_idxs = paddle.argmax(softmax_scores, axis=1)
        correct_predictions_count = paddle.sum(
            paddle.cast(max_idxs == positive_idx, "int64"))

        if loss_scale:
            loss = loss * loss_scale

        return loss, correct_predictions_count
```
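Since the PR's subject is GradientCache, a compact sketch of the technique may be useful context for review: embeddings are first computed chunk by chunk with autograd off, the contrastive loss is computed once over the cached embeddings to obtain gradients with respect to those embeddings, and each chunk is then re-encoded with autograd on so the cached gradients can flow into the encoder parameters. This is an illustrative sketch only, not the PR's train_gradient_cache_DPR.py; the helper name `gradient_cache_step`, the chunk layout, and the equal-sized-chunk assumption are all hypothetical.

```python
import paddle

def gradient_cache_step(model, loss_fn, q_chunks, ctx_chunks, positive_idx):
    """One gradient-cached bi-encoder step (illustrative sketch, not PR code).

    q_chunks / ctx_chunks: equal-sized input chunks forming one large virtual
    batch; positive_idx: index of the positive context for each question.
    """
    # Pass 1: embed every chunk with autograd off (low activation memory).
    with paddle.no_grad():
        q_embs = [model.get_question_pooled_embedding(*c) for c in q_chunks]
        ctx_embs = [model.get_context_pooled_embedding(*c) for c in ctx_chunks]

    # Contrastive loss over the full cached batch; gradients are tracked only
    # with respect to the cached embeddings, not the encoder parameters.
    q_cache = paddle.concat(q_embs)
    ctx_cache = paddle.concat(ctx_embs)
    q_cache.stop_gradient = False
    ctx_cache.stop_gradient = False
    loss, _ = loss_fn.calc(q_cache, ctx_cache, positive_idx)
    loss.backward()  # fills q_cache.grad / ctx_cache.grad only

    # Pass 2: re-encode each chunk with autograd on, and inject the cached
    # embedding gradients via a surrogate loss (d/d emb of sum(emb * g) == g).
    q_grads = paddle.split(q_cache.grad, num_or_sections=len(q_chunks))
    for chunk, g in zip(q_chunks, q_grads):
        emb = model.get_question_pooled_embedding(*chunk)
        paddle.sum(emb * g.detach()).backward()

    ctx_grads = paddle.split(ctx_cache.grad, num_or_sections=len(ctx_chunks))
    for chunk, g in zip(ctx_chunks, ctx_grads):
        emb = model.get_context_pooled_embedding(*chunk)
        paddle.sum(emb * g.detach()).backward()

    return loss  # caller then runs optimizer.step() / optimizer.clear_grad()
```

The memory saving comes from never holding activations for more than one chunk at a time, while the loss still sees the full virtual batch of in-batch negatives.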
> Review comment: You could add a new training command for GradientCache rather than modifying the existing training commands.