[Hackathon + GradientCache] #1799
@@ -0,0 +1,228 @@

```python
import collections
import json
import random
from typing import List

import numpy as np
import paddle
from paddle.io import Dataset
from paddlenlp.transformers.bert.tokenizer import BertTokenizer

BiEncoderPassage = collections.namedtuple("BiEncoderPassage", ["text", "title"])

BiEncoderBatch = collections.namedtuple(
    "BiEncoderInput",
    [
        "questions_ids",
        "question_segments",
        "context_ids",
        "ctx_segments",
        "is_positive",
        "hard_negatives",
        "encoder_type",
    ],
)


def normalize_question(question: str) -> str:
    return question.replace("’", "'")


def normalize_passage(ctx_text: str):
    ctx_text = ctx_text.replace("\n", " ").replace("’", "'")
    if ctx_text.startswith('"'):
        ctx_text = ctx_text[1:]
    if ctx_text.endswith('"'):
        ctx_text = ctx_text[:-1]
    return ctx_text


class BiEncoderSample(object):
    query: str
    positive_passages: List[BiEncoderPassage]
    negative_passages: List[BiEncoderPassage]
    hard_negative_passages: List[BiEncoderPassage]


class NQdataSetForDPR(Dataset):
    """Dataset of Natural Questions samples in the DPR JSON format."""

    def __init__(self, dataPath, query_special_suffix=None):
        super(NQdataSetForDPR, self).__init__()
        self.data = self._read_json_data(dataPath)
        self.query_special_suffix = query_special_suffix
        # Materialize all samples once up front (the original called the
        # nonexistent self.lens()).
        self.new_data = [self.__getitem__(i) for i in range(len(self))]

    def _read_json_data(self, dataPath):
        results = []
        with open(dataPath, "r", encoding="utf-8") as f:
            print("Reading file %s" % dataPath)
            data = json.load(f)
            results.extend(data)
            print("Aggregated data size: {}".format(len(results)))
        return results

    def __getitem__(self, index):
        json_sample_data = self.data[index]
        r = BiEncoderSample()
        r.query = self._process_query(json_sample_data["question"])

        positive_ctxs = json_sample_data["positive_ctxs"]
        negative_ctxs = json_sample_data.get("negative_ctxs", [])
        hard_negative_ctxs = json_sample_data.get("hard_negative_ctxs", [])

        for ctx in positive_ctxs + negative_ctxs + hard_negative_ctxs:
            if "title" not in ctx:
                ctx["title"] = None

        def create_passage(ctx):
            return BiEncoderPassage(normalize_passage(ctx["text"]), ctx["title"])

        r.positive_passages = [create_passage(ctx) for ctx in positive_ctxs]
        r.negative_passages = [create_passage(ctx) for ctx in negative_ctxs]
        r.hard_negative_passages = [create_passage(ctx) for ctx in hard_negative_ctxs]

        return r

    def _process_query(self, query):
        query = normalize_question(query)
        if self.query_special_suffix and not query.endswith(self.query_special_suffix):
            query += self.query_special_suffix
        return query

    def __len__(self):
        return len(self.data)


class DataUtil(object):
    def __init__(self, max_length=256):
        # The original instantiated BertTensorizer() without the required
        # max_length; 256 is an assumed default matching the DPR passage length.
        self.tensorizer = BertTensorizer(max_length)

    def create_biencoder_input(self,
                               samples: List[BiEncoderSample],
                               inserted_title,
                               num_hard_negatives=0,
                               num_other_negatives=0,
                               shuffle=True,
                               shuffle_positives=False,
                               hard_neg_positives=False,  # unused
                               hard_neg_fallback=True,
                               query_token=None):
        question_tensors = []
        ctx_tensors = []
        positive_ctx_indices = []
        hard_neg_ctx_indices = []

        for sample in samples:
            if shuffle and shuffle_positives:
                positive_ctxs = sample.positive_passages
                positive_ctx = positive_ctxs[np.random.choice(len(positive_ctxs))]
            else:
                positive_ctx = sample.positive_passages[0]

            neg_ctxs = sample.negative_passages
            hard_neg_ctxs = sample.hard_negative_passages
            question = sample.query

            if shuffle:
                random.shuffle(neg_ctxs)
                random.shuffle(hard_neg_ctxs)

            # Fall back to ordinary negatives when a sample has no hard negatives.
            if hard_neg_fallback and len(hard_neg_ctxs) == 0:
                hard_neg_ctxs = neg_ctxs[0:num_hard_negatives]

            neg_ctxs = neg_ctxs[0:num_other_negatives]
            hard_neg_ctxs = hard_neg_ctxs[0:num_hard_negatives]

            all_ctxs = [positive_ctx] + neg_ctxs + hard_neg_ctxs
            hard_negative_start_idx = 1
            hard_negative_end_idx = 1 + len(hard_neg_ctxs)

            current_ctxs_len = len(ctx_tensors)

            sample_ctxs_tensors = [
                self.tensorizer.text_to_tensor(
                    ctx.text,
                    title=ctx.title if (inserted_title and ctx.title) else None)
                for ctx in all_ctxs
            ]

            ctx_tensors.extend(sample_ctxs_tensors)
            positive_ctx_indices.append(current_ctxs_len)
            # Record this sample's hard-negative positions as a list (the
            # original appended a bare generator expression).
            hard_neg_ctx_indices.append(
                list(range(
                    current_ctxs_len + hard_negative_start_idx,
                    current_ctxs_len + hard_negative_end_idx,
                )))

            # NOTE: the query_token / "[START_END]" span-selection branch of the
            # original DPR code is not implemented here.
            question_tensors.append(self.tensorizer.text_to_tensor(question))

        ctxs_tensor = paddle.concat([paddle.reshape(ctx, [1, -1]) for ctx in ctx_tensors], axis=0)
        questions_tensor = paddle.concat([paddle.reshape(q, [1, -1]) for q in question_tensors], axis=0)

        ctx_segments = paddle.zeros_like(ctxs_tensor)
        question_segments = paddle.zeros_like(questions_tensor)

        return BiEncoderBatch(
            questions_tensor,
            question_segments,
            ctxs_tensor,
            ctx_segments,
            positive_ctx_indices,
            hard_neg_ctx_indices,
            "question",
        )


class BertTensorizer(object):
    def __init__(self, max_length: int, pad_to_max=True):
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.max_length = max_length
        self.pad_to_max = pad_to_max

    def text_to_tensor(self,
                       text: str,
                       title=None,
                       add_special_tokens=True,
                       apply_max_len=True):
        text = text.strip()

        # truncation_strategy expects a strategy string, not the boolean the
        # original passed.
        if title:
            encoded = self.tokenizer.encode(
                title,
                text_pair=text,
                add_special_tokens=add_special_tokens,
                max_seq_len=self.max_length if apply_max_len else 10000,
                pad_to_max_seq_len=False,
                truncation_strategy="longest_first",
            )
        else:
            encoded = self.tokenizer.encode(
                text,
                add_special_tokens=add_special_tokens,
                max_seq_len=self.max_length if apply_max_len else 10000,
                pad_to_max_seq_len=False,
                truncation_strategy="longest_first",
            )
        # PaddleNLP's encode returns a dict; keep only the token ids.
        token_ids = encoded["input_ids"]

        seq_len = self.max_length
        if self.pad_to_max and len(token_ids) < seq_len:
            # Pad with pad_token_id (the original used pad_token_type_id, which
            # only works because both happen to be 0 for BERT).
            token_ids = token_ids + [self.tokenizer.pad_token_id] * (seq_len - len(token_ids))
        if len(token_ids) >= seq_len:
            token_ids = token_ids[0:seq_len] if apply_max_len else token_ids
            token_ids[-1] = self.tokenizer.sep_token_id  # keep a closing [SEP]

        return paddle.to_tensor(token_ids)
```
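For reviewers who want to smoke-test the loaders above, here is a minimal, hypothetical usage sketch (not part of the PR); the data path, the `DataUtil()` default length, and the batch slicing are all assumptions.

```python
# Hypothetical smoke test for the classes above (not part of the PR).
# Assumes an NQ-format JSON file at data/nq-train.json.
dataset = NQdataSetForDPR("data/nq-train.json")
util = DataUtil()  # uses the assumed max_length=256 default

# One bi-encoder batch from the first 4 samples, 1 hard negative per sample.
batch = util.create_biencoder_input(
    dataset.new_data[:4],
    inserted_title=True,
    num_hard_negatives=1,
)
print(batch.questions_ids.shape)  # e.g. [4, 256]
print(batch.context_ids.shape)    # e.g. [8, 256]: 4 positives + 4 hard negatives
```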
```diff
@@ -78,7 +78,7 @@
 Taking the semantic-similarity training data we provide as an example, the following command starts model training with the In-batch negatives strategy on GPU cards 0,1,2,3:

-python -u -m paddle.distributed.launch --gpus "0,1,2,3" \
+python -u -m paddle.distributed.launch --gpus "0" \
     train_batch_neg.py \
     --device gpu \
     --save_dir ./checkpoints/ \
```
```diff
@@ -144,7 +144,7 @@ python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
     --device gpu \
     --recall_result_dir "recall_result_dir" \
     --recall_result_file "recall_result.txt" \
-    --params_path "${checkpoints_params_file}" \
+    --params_path "./temp10/model_state.pdparams" \
     --hnsw_m 100 \
     --hnsw_ef 100 \
     --batch_size 64 \
```

> Review comment: Avoid hard-coding this path.
```diff
@@ -178,7 +178,7 @@ python -u -m paddle.distributed.launch --gpus "0" --log_dir "recall_log/" \
 Next, run the following command to evaluate retrieval quality, producing the R@10 and R@50 metrics:
 python -u evaluate.py \
-    --similar_pair_file "semantic_similar_pair.tsv" \
+    --similar_text_pair "semantic_similar_pair.tsv" \
     --recall_result_file "./recall_result_dir/recall_result.txt" \
     --recall_num 50
```
@@ -0,0 +1,6 @@

### "Data preparation": download the data with download_data.py from the facebook repository, then place it in the corresponding folders.
### "Model training": run train_gradient_cache_DPR.py; set its parameters following the best training strategy from the original paper's repository, and set the model save location.
### "Evaluation": first run generate_dense_embeddings.py, then run dense_retriever.py.
### The evaluation files were taken from facebook's repository, with the torch-based parts reimplemented in paddle.
## The run parameters of the files above are identical to those of the original DPR repository.
## In train, the global variables only need to be turned into command-line parameters.

> Review comment: Could you document Gradient_Cache properly, following the conventions of the README at https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/semantic_indexing ?
@@ -0,0 +1,85 @@

> Review comment (on `class BiEncoder`): Add the Paddle copyright notice to the Python files.

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class BiEncoder(nn.Layer):
    """Bi-encoder over a question encoder and a context encoder.

    `state` selects which side to build: None builds both encoders,
    "FORQUESTION" only the question side, "FORCONTEXT" only the context side.
    """

    def __init__(self, question_encoder, context_encoder, dropout, output_emb_size=768, state=None):
        super(BiEncoder, self).__init__()
        self.state = state
        if self.state is None:
            self.question_encoder = question_encoder
            self.context_encoder = context_encoder
        elif self.state == "FORQUESTION":
            self.question_encoder = question_encoder
        elif self.state == "FORCONTEXT":
            self.context_encoder = context_encoder
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        weight_attr = paddle.ParamAttr(
            initializer=paddle.nn.initializer.TruncatedNormal(std=0.02))
        self.emb_reduce_linear = paddle.nn.Linear(768, output_emb_size, weight_attr=weight_attr)

    def get_question_pooled_embedding(self,
                                      input_ids,
                                      token_type_ids=None,
                                      position_ids=None,
                                      attention_mask=None):
        _, cls_embedding = self.question_encoder(input_ids, token_type_ids, position_ids, attention_mask)
        # Projection / dropout / normalization are currently disabled:
        # cls_embedding = self.emb_reduce_linear(cls_embedding)
        # cls_embedding = self.dropout(cls_embedding)
        # cls_embedding = F.normalize(cls_embedding, p=2, axis=-1)
        return cls_embedding

    def get_context_pooled_embedding(self,
                                     input_ids,
                                     token_type_ids=None,
                                     position_ids=None,
                                     attention_mask=None):
        _, cls_embedding = self.context_encoder(input_ids, token_type_ids, position_ids, attention_mask)
        # Projection / dropout / normalization are currently disabled (see above).
        return cls_embedding

    def forward(self,
                question_id,
                question_segments,
                question_attn_mask,
                context_ids,
                context_segments,
                context_attn_mask):
        # Pass the attention masks by keyword; the original passed them
        # positionally, where they were consumed as position_ids.
        question_pooled_out = self.get_question_pooled_embedding(
            question_id, question_segments, attention_mask=question_attn_mask)
        context_pooled_out = self.get_context_pooled_embedding(
            context_ids, context_segments, attention_mask=context_attn_mask)
        return question_pooled_out, context_pooled_out


class BiEncoderNllLoss(object):
    def calc(self,
             q_vectors,
             ctx_vectors,
             positive_idx_per_question,
             loss_scale=None):
        # Similarity matrix: one row per question, one column per context.
        # The original transposed with perm [0, 1], a no-op; [1, 0] is intended.
        scores = paddle.matmul(q_vectors, paddle.transpose(ctx_vectors, [1, 0]))

        if len(q_vectors.shape) > 1:
            q_num = q_vectors.shape[0]
            scores = paddle.reshape(scores, [q_num, -1])

        softmax_scores = F.log_softmax(scores, axis=1)

        positive_idx = paddle.to_tensor(positive_idx_per_question)
        loss = F.nll_loss(softmax_scores, positive_idx)

        # Count how often the top-scoring context is the positive one
        # (the original left this as None).
        max_idxs = paddle.argmax(softmax_scores, axis=1)
        correct_predictions_count = paddle.sum(
            paddle.cast(max_idxs == positive_idx, "int64"))

        if loss_scale:
            loss = loss * loss_scale

        return loss, correct_predictions_count
```
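Since the PR's subject is GradientCache, a compact sketch of the technique may be useful context for review: embeddings are first computed chunk by chunk with autograd off, the contrastive loss is computed once over the cached embeddings to obtain gradients with respect to those embeddings, and each chunk is then re-encoded with autograd on so the cached gradients can flow into the encoder parameters. This is an illustrative sketch only, not the PR's train_gradient_cache_DPR.py; the helper name `gradient_cache_step`, the chunk layout, and the equal-sized-chunk assumption are all hypothetical.

```python
import paddle

def gradient_cache_step(model, loss_fn, q_chunks, ctx_chunks, positive_idx):
    """One gradient-cached bi-encoder step (illustrative sketch, not PR code).

    q_chunks / ctx_chunks: equal-sized input chunks forming one large virtual
    batch; positive_idx: index of the positive context for each question.
    """
    # Pass 1: embed every chunk with autograd off (low activation memory).
    with paddle.no_grad():
        q_embs = [model.get_question_pooled_embedding(*c) for c in q_chunks]
        ctx_embs = [model.get_context_pooled_embedding(*c) for c in ctx_chunks]

    # Contrastive loss over the full cached batch; gradients are tracked only
    # with respect to the cached embeddings, not the encoder parameters.
    q_cache = paddle.concat(q_embs)
    ctx_cache = paddle.concat(ctx_embs)
    q_cache.stop_gradient = False
    ctx_cache.stop_gradient = False
    loss, _ = loss_fn.calc(q_cache, ctx_cache, positive_idx)
    loss.backward()  # fills q_cache.grad / ctx_cache.grad only

    # Pass 2: re-encode each chunk with autograd on, and inject the cached
    # embedding gradients via a surrogate loss (d/d emb of sum(emb * g) == g).
    q_grads = paddle.split(q_cache.grad, num_or_sections=len(q_chunks))
    for chunk, g in zip(q_chunks, q_grads):
        emb = model.get_question_pooled_embedding(*chunk)
        paddle.sum(emb * g.detach()).backward()

    ctx_grads = paddle.split(ctx_cache.grad, num_or_sections=len(ctx_chunks))
    for chunk, g in zip(ctx_chunks, ctx_grads):
        emb = model.get_context_pooled_embedding(*chunk)
        paddle.sum(emb * g.detach()).backward()

    return loss  # caller then runs optimizer.step() / optimizer.clear_grad()
```

The memory saving comes from never holding activations for more than one chunk at a time, while the loss still sees the full virtual batch of in-batch negatives.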
> Review comment: You could add a new training command for GradientCache rather than modifying the existing training commands.