Skip to content

Commit

Permalink
Merge pull request #5 from lonePatient/dev-0.8.1
Browse files Browse the repository at this point in the history
update version_0.8.1
  • Loading branch information
lonePatient committed Mar 4, 2023
2 parents 5e94106 + 5dfaa4a commit 0964e05
Show file tree
Hide file tree
Showing 216 changed files with 11,125 additions and 3,826 deletions.
Binary file modified .DS_Store
Binary file not shown.
Empty file modified LICENSE
100755 → 100644
Empty file.
15 changes: 12 additions & 3 deletions README.md
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,18 @@

A PyTorch-based toolkit for natural language processing

![python](https://img.shields.io/badge/-Python_3.7_%7C_3.8_%7C_3.9_%7C_3.10-blue?logo=python&logoColor=white)
![pytorch](https://img.shields.io/badge/PyTorch_1.10+-ee4c2c?logo=pytorch&logoColor=white)
![black](https://img.shields.io/badge/Code%20Style-Black-black.svg?labelColor=gray)
![license](https://img.shields.io/badge/License-MIT-green.svg?labelColor=gray)

### Requirements

- torch>=1.6.0
- transformers>=4.1.1
- torchmetrics>=0.6.0
- torch>=1.10.0
- tokenizers >= 0.7.0
- transformers>=4.10.0
- torchmetrics>=0.11.3


TorchBlocks requires Python 3.7+. We recommend installing TorchBlocks in a Linux or OSX environment.

Expand All @@ -22,6 +28,7 @@ git clone https://github.com/lonePatient/TorchBlocks.git
cd TorchBlocks
python setup.py install
```
⚠️**Note:** This project is still in the development stage and some of the interfaces are subject to change.

### Tutorials

Expand All @@ -30,5 +37,7 @@ python setup.py install
* Tutorial 3 (sequence labeling): [task_sequence_labeling_ner_crf.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_ner_crf.py)
* Tutorial 4 (sentence similarity): [task_sentence_similarity_lcqmc.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sentence_similarity_lcqmc.py)
* Tutorial 5 (triple similarity): [task_triple_similarity_epidemic.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_triple_similarity_epidemic.py)
* Tutorial 6 (sequence labeling): [task_sequence_labeling_resume_beam_search_softmax.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_resume_beam_search_softmax.py)
* Tutorial 7 (sequence labeling): [task_sequence_labeling_resume_global_pointer.py](https://github.com/lonePatient/TorchBlocks/blob/master/examples/task_sequence_labeling_resume_global_pointer.py)
* Example scripts for each task: [TorchBlocks/examples/](https://github.com/lonePatient/TorchBlocks/tree/master/examples)

File renamed without changes.
4 changes: 4 additions & 0 deletions docs/apex_install
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
$ git clone https://github.com/NVIDIA/apex
$ sed -i "s/or (bare_metal_minor != torch_binary_minor)//g" apex/setup.py
$ pip install -v --disable-pip-version-check --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" apex/
$ rm -rf apex
52 changes: 0 additions & 52 deletions examples/README.md

This file was deleted.

48 changes: 48 additions & 0 deletions examples/ccks_kfold_split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from torchblocks.utils import json_to_text
from sklearn.model_selection import StratifiedKFold


def get_data(data_path, datatype):
    """Flatten a line-delimited query/candidate file into one record per pair.

    Every non-blank line of ``data_path`` must hold a Python dict literal
    with a ``query`` entry and a ``candidate`` list of dicts.

    Args:
        data_path: Path of the UTF-8 text file to read.
        datatype: ``'train'`` selects the labelled layout; any other value
            selects the unlabelled (prediction) layout.

    Returns:
        A list of dicts. For ``'train'`` each dict has the keys ``query``,
        ``candidate`` and ``label``; records with an empty query and
        candidates with an empty text are dropped. Otherwise each dict has
        the keys ``text_id``, ``query`` and ``candidate`` and nothing is
        filtered out.

    Raises:
        ValueError, SyntaxError: If a line is not a plain Python literal.
    """
    # Function-scope import so the block does not depend on the file header.
    import ast

    is_train = datatype == 'train'
    data = []
    with open(data_path, encoding='utf-8') as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                # Tolerate blank / trailing empty lines.
                continue
            # literal_eval accepts the same dict literals eval() did, but it
            # refuses calls, attribute access and every other executable
            # expression, so a tampered data file cannot run code.
            record = ast.literal_eval(text)
            if is_train:
                if record['query'] == '':
                    continue
                for candidate in record['candidate']:
                    if candidate['text'] == '':
                        continue
                    data.append({
                        'query': record['query'],
                        'candidate': candidate['text'],
                        'label': candidate['label'],
                    })
            else:
                for candidate in record['candidate']:
                    data.append({
                        'text_id': record['text_id'],
                        'query': record['query'],
                        'candidate': candidate['text'],
                    })
    return data


def generate_data(train_data, random_state=42, n_splits=3, output_dir='../dataset/ccks2021'):
    """Write stratified k-fold train/dev splits of ``train_data``.

    The examples are stratified on their ``label`` value. For every fold the
    train and dev subsets are handed to ``json_to_text`` under the names
    ``ccks2021_train_seed{random_state}_fold{fold}.json`` and
    ``ccks2021_dev_seed{random_state}_fold{fold}.json`` inside ``output_dir``.

    Args:
        train_data: Sequence of dicts, each carrying a ``label`` entry.
        random_state: Seed for the shuffled split; also embedded in the
            output file names.
        n_splits: Number of folds to generate (default 3, the value that
            used to be hard-coded).
        output_dir: Directory prefix for the generated files (default is the
            path that used to be hard-coded).
    """
    labels = [example['label'] for example in train_data]
    # StratifiedKFold only needs something of the right length as X; the
    # stratification is driven entirely by the labels.
    positions = range(len(train_data))
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    for fold, (train_index, dev_index) in enumerate(splitter.split(positions, labels)):
        train_fold = [train_data[index] for index in train_index]
        dev_fold = [train_data[index] for index in dev_index]
        json_to_text(f'{output_dir}/ccks2021_train_seed{random_state}_fold{fold}.json', train_fold)
        json_to_text(f'{output_dir}/ccks2021_dev_seed{random_state}_fold{fold}.json', dev_fold)


if __name__ == '__main__':
    # Merge both competition rounds into one pool, then emit one set of
    # k-fold files per seed.
    train_paths = (
        '../dataset/ccks2021/round1_train.txt',
        '../dataset/ccks2021/round2_train.txt',
    )
    train_data = []
    for train_path in train_paths:
        train_data.extend(get_data(train_path, 'train'))
    for seed in (42, 24, 33):
        generate_data(train_data, seed)
    print('...............kf finish...........')
57 changes: 57 additions & 0 deletions examples/cner_kfold_split.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from torchblocks.utils import json_to_text
from torchblocks.tasks import get_spans_from_bio_tags
from torchblocks.data.splits import split_ner_stratified_kfold

# Split the CNER data into stratified folds using a multi-label scheme
# (the entity annotations of each sentence drive the stratification).

train_file = '../dataset/cner/train.char.bmes'
dev_file = '../dataset/cner/dev.char.bmes'
folds = 5
sentences = []
lines = []
for input_file in [train_file, dev_file]:
    # Decode explicitly as UTF-8 so the Chinese corpus is read the same way
    # regardless of the platform's default encoding.
    with open(input_file, 'r', encoding='utf-8') as f:
        words, labels = [], []
        for line in f:
            # A document marker or an empty line closes the current sentence.
            if line.startswith("-DOCSTART-") or line == "\n":
                if words:
                    lines.append([words, labels])
                    words, labels = [], []
            else:
                splits = line.split(" ")
                words.append(splits[0])
                if len(splits) > 1:
                    label = splits[-1].replace("\n", "")
                    # Collapse the BMES scheme to BIO: middle and end tags
                    # both become inside tags.
                    if 'M-' in label:
                        label = label.replace('M-', 'I-')
                    elif 'E-' in label:
                        label = label.replace('E-', 'I-')
                    elif 'S-' in label:
                        # Drop S- tags, mainly to simplify later experiments.
                        label = "O"
                    labels.append(label)
                else:
                    # A token without a tag column is treated as outside.
                    labels.append("O")
        # Flush the last sentence when the file does not end with a blank line.
        if words:
            lines.append([words, labels])

for i, (words, labels) in enumerate(lines):
    spans = get_spans_from_bio_tags(labels, id2label=None)
    new_spans = []
    for tag, start, end in spans:
        # NOTE(review): `end` appears to be inclusive, so it is stored and
        # sliced as end + 1 -- confirm against get_spans_from_bio_tags.
        new_spans.append([tag, start, end + 1, "".join(words[start:(end + 1)])])
    sentence = {'id': i, 'text': words, 'entities': new_spans, 'bio_seq': labels}
    sentences.append(sentence)

entities_list = [x['entities'] for x in sentences]
# Use the configured fold count instead of a second hard-coded literal.
all_indices = split_ner_stratified_kfold(entities_list, num_folds=folds)
for fold, (train_indices, val_indices) in enumerate(all_indices):
    print("The number of train examples: ", len(train_indices))
    print("The number of dev examples: ", len(val_indices))
    train_data = [sentences[i] for i in train_indices]
    dev_data = [sentences[i] for i in val_indices]
    json_to_text(f'../dataset/cner/cner_train_fold{fold}.json', train_data)
    json_to_text(f'../dataset/cner/cner_dev_fold{fold}.json', dev_data)

Loading

0 comments on commit 0964e05

Please sign in to comment.