Merge pull request #5 from lonePatient/dev-0.8.1

update version_0.8.1
lonePatient · Mar 4, 2023 · 0964e05 · 0964e05
2 parents 5e94106 + 5dfaa4a
commit 0964e05
Show file tree

Hide file tree

Showing 216 changed files with 11,125 additions and 3,826 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/LICENSE b/LICENSE
diff --git a/README.md b/README.md
@@ -4,12 +4,18 @@
 
 A PyTorch-based toolkit for natural language processing
 
+![python](https://img.shields.io/badge/-Python_3.7_%7C_3.8_%7C_3.9_%7C_3.10-blue?logo=python&logoColor=white)
+![pytorch](https://img.shields.io/badge/PyTorch_1.10+-ee4c2c?logo=pytorch&logoColor=white)
+![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray)
+![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)
 
 ### Requirements
 
-- torch>=1.6.0
-- transformers>=4.1.1
-- torchmetrics>=0.6.0
+- torch>=1.10.0
+- tokenizers>=0.7.0
+- transformers>=4.10.0
+- torchmetrics>=0.11.3
+
 
 TorchBlocks requires Python 3.7+. We recommend installing TorchBlocks in a Linux or OSX environment.
 
@@ -22,6 +28,7 @@ git clone https://github.com/lonePatient/TorchBlocks.git
 cd TorchBlocks
 python setup.py install
 ```
+⚠️**Note:** This project is still in the development stage and some of the interfaces are subject to change.
 
 ### Tutorials
 
@@ -30,5 +37,7 @@ python setup.py install
 * Tutorial 3 (sequence labeling): [task_sequence_labeling_ner_crf.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_ner_crf.py)
 * Tutorial 4 (sentence similarity): [task_sentence_similarity_lcqmc.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sentence_similarity_lcqmc.py)
 * Tutorial 5 (triple similarity): [task_triple_similarity_epidemic.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_triple_similarity_epidemic.py)
+* Tutorial 6 (sequence labeling): [task_sequence_labeling_resume_beam_search_softmax.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_resume_beam_search_softmax.py)
+* Tutorial 7 (sequence labeling): [task_sequence_labeling_resume_global_pointer.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_resume_global_pointer.py)
 * Example scripts for each task: [TorchBlocks/examples/](https://github.com/lonePatient/TorchBlocks/tree/master/examples)
 
diff --git a/...blocks/metrics/classification/__init__.py → docs/__init__.py b/...blocks/metrics/classification/__init__.py → docs/__init__.py
diff --git a/docs/apex_install b/docs/apex_install
@@ -0,0 +1,4 @@
+$ git clone https://github.com/NVIDIA/apex
+$ sed -i "s/or (bare_metal_minor != torch_binary_minor)//g" apex/setup.py
+$ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" apex/
+$ rm -rf apex
diff --git a/examples/README.md b/examples/README.md
diff --git a/examples/ccks_kfold_split.py b/examples/ccks_kfold_split.py
@@ -0,0 +1,48 @@
+from torchblocks.utils import json_to_text
+from sklearn.model_selection import StratifiedKFold
+
+
+def get_data(data_path, datatype):
+    """Flatten a query/candidate file into one record per candidate.
+
+    Each non-blank line of ``data_path`` must be a Python dict literal with a
+    ``'query'`` string and a ``'candidate'`` list of dicts.
+
+    Args:
+        data_path: Path of the text file to read (decoded as UTF-8).
+        datatype: ``'train'`` produces labelled records and drops entries with
+            an empty query or an empty candidate text; any other value
+            produces unlabelled records that carry the line's ``'text_id'``.
+
+    Returns:
+        A list of dicts: ``{'query', 'candidate', 'label'}`` in train mode,
+        ``{'text_id', 'query', 'candidate'}`` otherwise.
+
+    Raises:
+        ValueError, SyntaxError: If a line is not a plain Python literal.
+    """
+    import ast
+
+    is_train = datatype == 'train'
+    data = []
+    with open(data_path, encoding='utf-8') as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            # literal_eval accepts only literals, so a crafted data file
+            # cannot execute code the way eval() would allow.
+            dict_txt = ast.literal_eval(line)
+            if not is_train:
+                for candidate in dict_txt['candidate']:
+                    data.append({'text_id': dict_txt['text_id'],
+                                 'query': dict_txt['query'],
+                                 'candidate': candidate['text']})
+                continue
+            if dict_txt['query'] == '':
+                continue
+            for candidate in dict_txt['candidate']:
+                if candidate['text'] == '':
+                    continue
+                data.append({'query': dict_txt['query'],
+                             'candidate': candidate['text'],
+                             'label': candidate['label']})
+    return data
+
+
+def generate_data(train_data, random_state=42, n_splits=3, output_dir='../dataset/ccks2021'):
+    """Write stratified train/dev fold files for ``train_data``.
+
+    The examples are split with a shuffled ``StratifiedKFold`` keyed on each
+    example's ``'label'``. For every fold two files are handed to
+    ``json_to_text``:
+    ``ccks2021_train_seed{random_state}_fold{fold}.json`` and
+    ``ccks2021_dev_seed{random_state}_fold{fold}.json``.
+
+    Args:
+        train_data: List of dicts, each holding a ``'label'`` key.
+        random_state: Seed for the shuffle; also embedded in the file names.
+        n_splits: Number of folds to produce.
+        output_dir: Directory prefix for the written files.
+    """
+    # Only the labels drive the stratification; X is just a row index.
+    X = range(len(train_data))
+    y = [example['label'] for example in train_data]
+    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
+    for fold, (train_index, dev_index) in enumerate(skf.split(X, y)):
+        fold_train = [train_data[index] for index in train_index]
+        fold_dev = [train_data[index] for index in dev_index]
+        json_to_text(f'{output_dir}/ccks2021_train_seed{random_state}_fold{fold}.json', fold_train)
+        json_to_text(f'{output_dir}/ccks2021_dev_seed{random_state}_fold{fold}.json', fold_dev)
+
+
+if __name__ == '__main__':
+    # Pool both round files into one fresh list so neither per-file result
+    # is mutated through an alias.
+    train_paths = (
+        '../dataset/ccks2021/round1_train.txt',
+        '../dataset/ccks2021/round2_train.txt',
+    )
+    train_data = []
+    for train_path in train_paths:
+        train_data.extend(get_data(train_path, 'train'))
+    # One complete set of fold files is written per seed.
+    for seed in (42, 24, 33):
+        generate_data(train_data, seed)
+    print('...............kf finish...........')
diff --git a/examples/cner_kfold_split.py b/examples/cner_kfold_split.py
@@ -0,0 +1,57 @@
+from torchblocks.utils import json_to_text
+from torchblocks.tasks import get_spans_from_bio_tags
+from torchblocks.data.splits import split_ner_stratified_kfold
+
+# Split the CNER data into stratified k folds, treating the entities of each
+# sentence as a multi-label target.
+
+train_file = '../dataset/cner/train.char.bmes'
+dev_file = '../dataset/cner/dev.char.bmes'
+folds = 5
+sentences = []
+lines = []
+
+
+def _to_bio_label(label):
+    """Convert one BMES tag to the B/I/O tag used by this script.
+
+    ``M-``/``E-`` prefixes become ``I-``; ``S-`` tags are mapped to ``"O"``;
+    every other tag is returned unchanged. Only the leading two characters
+    are inspected, so an entity type that happens to contain ``M-``, ``E-``
+    or ``S-`` is not rewritten.
+    """
+    prefix = label[:2]
+    if prefix in ('M-', 'E-'):
+        return 'I-' + label[2:]
+    if prefix == 'S-':
+        # S tags are dropped, mainly to simplify later experiments.
+        return 'O'
+    return label
+
+
+# Train and dev files are read into a single pool of [words, labels] pairs.
+for input_file in [train_file, dev_file]:
+    with open(input_file, 'r', encoding='utf-8') as f:
+        words, labels = [], []
+        for line in f:
+            # A document marker or an empty line ends the current sentence.
+            if line.startswith("-DOCSTART-") or line in ("", "\n"):
+                if words:
+                    lines.append([words, labels])
+                    words, labels = [], []
+                continue
+            # Strip the newline first so a token-only line does not keep it
+            # inside the word.
+            splits = line.rstrip("\n").split(" ")
+            words.append(splits[0])
+            if len(splits) > 1:
+                labels.append(_to_bio_label(splits[-1]))
+            else:
+                # A line without a tag column is treated as outside any entity.
+                labels.append("O")
+        # Flush the last sentence when the file does not end with a blank line.
+        if words:
+            lines.append([words, labels])
+
+for i, (words, labels) in enumerate(lines):
+    spans = get_spans_from_bio_tags(labels, id2label=None)
+    # NOTE(review): `end` is presumably inclusive, hence the +1 for both the
+    # stored offset and the slice -- confirm against get_spans_from_bio_tags.
+    new_spans = [
+        [tag, start, end + 1, "".join(words[start:(end + 1)])]
+        for tag, start, end in spans
+    ]
+    sentences.append({'id': i, 'text': words, 'entities': new_spans, 'bio_seq': labels})
+
+entities_list = [x['entities'] for x in sentences]
+# Use the configured fold count rather than a second hard-coded literal.
+all_indices = split_ner_stratified_kfold(entities_list, num_folds=folds)
+for fold, (train_indices, val_indices) in enumerate(all_indices):
+    print("The number of train examples: ",len(train_indices))
+    print("The number of dev examples: ", len(val_indices))
+    train_data = [sentences[i] for i in train_indices]
+    dev_data = [sentences[i] for i in val_indices]
+    json_to_text(f'../dataset/cner/cner_train_fold{fold}.json', train_data)
+    json_to_text(f'../dataset/cner/cner_dev_fold{fold}.json', dev_data)
+
+