Skip to content

Commit

Permalink
Merge pull request #75 from thunlp/dev
Browse files Browse the repository at this point in the history
fix template format & BMInf CPM (partially supported)
  • Loading branch information
Achazwl authored Dec 9, 2021
2 parents 0540fd3 + 7dd506c commit a478a28
Show file tree
Hide file tree
Showing 41 changed files with 2,155 additions and 92 deletions.
4 changes: 3 additions & 1 deletion experiments/relation_classification_ptr.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ plm:
freeze_para: False
lr: 0.00003
weight_decay: 0.01
eps: 1.0E-6
adam_epsilon: 1.0E-6
scheduler:
type:
num_warmup_steps: 500
Expand Down Expand Up @@ -43,6 +43,8 @@ ptr_template:
choice: 0
file_path: scripts/RelationClassification/TACRED/ptr_template.txt
optimize:
name: AdamW
adam_epsilon: 1.0e-8
lr: 0.00001

ptr_verbalizer:
Expand Down
11 changes: 11 additions & 0 deletions openprompt/data_utils/ZH/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from .closed_QA import *
from .coreference import *
from .entity_typing import *
from .generation import *
from .nli import *
from .paraphrase import *
from .reading_comprehensation import *
from .relation import *
from .sentiment import *
from .summarization import *
from .topic_classification import *
30 changes: 30 additions & 0 deletions openprompt/data_utils/ZH/closed_QA.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from .processor import *

class ZhiDao(DataProcessor):
"""
"""
def __init__(self):
super().__init__()

def get_examples(self, data_dir, split):
if split != 'train': raise ValueError
path = os.path.join(data_dir, f"{split}.csv")
with open(path, encoding='utf8') as f:
reader = csv.reader(f, delimiter=",")
for i, row in enumerate(reader):
if i==0: continue
title, question, reply, is_best = row
if not is_best: continue
example = InputExample(
meta = {
"title": title,
"question": question,
},
tgt_text = reply,
)
examples.append(example)

def get_templates(self):
return [
'问题:{title} {question} 回答:',
]
81 changes: 81 additions & 0 deletions openprompt/data_utils/ZH/coreference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from .processor import *

class Mandarinograd(CLSProcessor):
"""
@inproceedings{bernard2020mandarinograd,
title={Mandarinograd: A Chinese Collection of Winograd Schemas},
author={Bernard, Timoth{\'e}e and Han, Ting},
booktitle={Proceedings of The 12th Language Resources and Evaluation Conference},
pages={21--26},
year={2020}
}
"""
def __init__(self):
super().__init__(
labels_origin = [
"0", "1",
],
labels_mapped = [
"不可以", "可以",
]
)

def get_examples(self, data_dir, split):
path = os.path.join(data_dir, f"{split}.json")

with open(path, encoding='utf8') as f:
for example_json in json.load(f).values():
example = InputExample(
meta = {
"before": example_json["text"],
"after": example_json["hypothesis"],
"options": self.labels_mapped,
},
tgt_text = self.get_label(example_json["class_id"]),
)
examples.append(example)



def get_templates(self):
return [
'文本:{before} 问题:上述文本进行指代消解后,可以理解为 "{after}" 吗?{options}',
]


class CLUEWSC2020(CLSProcessor):
"""
"""
def __init__(self):
super().__init__(
labels_origin = [
"false", "true",
],
labels_mapped = [
"否", "是",
]
)

def get_examples(self, data_dir, split):
path = os.path.join(data_dir, f"{split}.json")

with open(path, encoding='utf8') as f:
for line in f:
example_json = json.loads(line)
example = InputExample(
meta = {
"that": example_json["target"]["span1_text"],
"it": example_json["target"]["span2_text"],
"text": example_json["text"],
"options": self.labels_mapped,
},
tgt_text = self.get_label(example_json["label"]),
)
examples.append(example)



def get_templates(self):
return [
'文本:{text} 问题:在上文中,“{it}“ 指代 "{that}" 吗?{options}',
]
214 changes: 214 additions & 0 deletions openprompt/data_utils/ZH/entity_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
from .processor import *

class CMeEE_NER(CLSProcessor):
"""
@inproceedings{hongying2020building,
title={Building a Pediatric Medical Corpus: Word Segmentation and Named Entity Annotation},
author={Hongying, Zan and Wenxin, Li and Kunli, Zhang and Yajuan, Ye and Baobao, Chang and Zhifang, Sui},
booktitle={Workshop on Chinese Lexical Semantics},
pages={652--664},
year={2020},
organization={Springer}
}
"""
def __init__(self):
super().__init__(
labels_origin = [
"dis", "sym", "dru", "equ", "pro", "bod", "ite", "mic", "dep",
],
labels_mapped = [
"疾病", "临床表现", "药物", "医疗设备", "医疗程序", "身体", "医学检验项目", "微生物类", "科室",
]
)

def get_examples(self, data_dir, split):
path = os.path.join(data_dir, f"{split}.json")
examples = []
with open(path, encoding='utf8') as f:
for example_json in json.load(f):
for span in example_json["entities"]:

example = InputExample(
meta = {
"context": example_json["text"],
"entity": span["entity"],
"options": self.labels_mapped,
},
label = self.get_label(span["type"])
)
examples.append(example)
return examples



def get_templates(self):
return [
'文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
]


class Resume_NER(CLSProcessor):
"""
@article{zhang2018chinese,
title={Chinese NER Using Lattice LSTM},
author={Yue Zhang and Jie Yang},
booktitle={Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (ACL)},
year={2018}
}
"""
def __init__(self):
super().__init__(
labels_origin = [
"NAME", "CONT", "LOC", "RACE", "PRO", "EDU", "ORG", "TITLE",
],
labels_mapped = [
"人名", "国籍", "籍贯", "种族", "专业", "学位", "机构", "职称",
]
)

def get_examples(self, data_dir, split):
path = os.path.join(data_dir, f"{split}.jsonline")

with open(path, encoding='utf8') as f:
for line in f:
example_json = json.loads(line)
for span in example_json["span_list"]:
example = InputExample(
meta = {
"context": "".join(example_json["tokens"]),
"entity": "".join(example_json["tokens"][span["start"]: span["end"]+1]),
"options": self.labels_mapped,
},
tgt_text = self.get_label(span["type"]),
)
examples.append(example)



def get_templates(self):
return [
'文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
]


class Weibo_NER(CLSProcessor):
"""
@inproceedings{peng2015ner,
title={Named Entity Recognition for Chinese Social Media with Jointly Trained Embeddings},
author={Peng, Nanyun and Dredze, Mark},
booktitle={Processings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)},
pages={548–-554},
year={2015}, File={https://www.aclweb.org/anthology/D15-1064/}, }
@inproceedings{peng2016improving,
title={Improving named entity recognition for Chinese social media with word segmentation representation learning},
author={Peng, Nanyun and Dredze, Mark},
booktitle={Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (ACL)},
volume={2},
pages={149--155},
year={2016}, File={https://www.aclweb.org/anthology/P16-2025/}, }
@inproceedings{he-sun-2017-f,
title = "{F}-Score Driven Max Margin Neural Network for Named Entity Recognition in {C}hinese Social Media",
author = "He, Hangfeng and
Sun, Xu",
booktitle = "Proceedings of the 15th Conference of the {E}uropean Chapter of the Association for Computational Linguistics: Volume 2, Short Papers",
month = apr,
year = "2017",
address = "Valencia, Spain",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/E17-2113",
pages = "713--718",
}
"""
def __init__(self):
super().__init__(
labels_origin = [ # TODO 区分 NAM 特指 NOM 泛指
"PER.NAM", "PER.NOM", "LOC.NAM", "LOC.NOM", "ORG.NAM", "ORG.NOM", "GPE.NAM", "GPE.NOM",
],
labels_mapped = [
"人", "泛指人", "地点", "泛指地点", "机构", "泛指机构", "地理政治实体", "泛指地理政治实体",
]
)

def get_examples(self, data_dir, split):
path = os.path.join(data_dir, f"{split}.jsonline")

with open(path, encoding='utf8') as f:
for line in f:
example_json = json.loads(line)
for span in example_json["span_list"]:
example = InputExample(
meta = {
"context": "".join(example_json["tokens"]),
"entity": "".join(example_json["tokens"][span["start"]: span["end"]+1]),
"options": self.labels_mapped,
},
tgt_text = self.get_label(span["type"]),
)
examples.append(example)



def get_templates(self):
return [
'文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
]


class DH_MSRA(CLSProcessor):
"""
"""
def __init__(self):
super().__init__(
labels_origin = [
"PER", "LOC", "ORG"
],
labels_mapped = [
"人", "地点", "机构"
]
)

def get_examples(self, data_dir, split):
if split != 'train': raise ValueError
path = os.path.join(data_dir, f"{split}.txt")

with open(path, encoding='utf8') as f:
xs, ys = [], []
for line in f:
l = line.split()
if len(l)==0:
i = 0
while i < len(xs):
if ys[i][0] == 'B':
j = i + 1
while j < len(xs):
if ys[j][0] == 'O': break
j = j + 1

example = InputExample(
meta = {
"context": "".join(xs),
"entity": "".join(xs[i:j]),
"options": self.labels_mapped,
},
tgt_text = self.get_label(ys[i][2:]),
)
examples.append(example)

i = j
else:
i = i + 1

xs, ys = [], []
else:
xs.append(l[0])
ys.append(l[1])




def get_templates(self):
return [
'文本:{context} 问题:上文中,实体“{entity}”是什么类型的? {options}',
]
Loading

0 comments on commit a478a28

Please sign in to comment.