From 06657742e7c9c7ae9fa0ff3c969bbeb4508a79a6 Mon Sep 17 00:00:00 2001 From: Patrick Amadeus Irawan Date: Fri, 31 May 2024 17:19:36 +0700 Subject: [PATCH] New schema: Add `chat` schema (#679) * feat: add chat schema * Update __init__.py add whitespace at EoF * Update chat.py change the `role` field schema from ClassLabel to string --- seacrowd/utils/constants.py | 4 ++++ seacrowd/utils/schemas/__init__.py | 2 ++ seacrowd/utils/schemas/chat.py | 24 ++++++++++++++++++++++++ 3 files changed, 30 insertions(+) create mode 100644 seacrowd/utils/schemas/chat.py diff --git a/seacrowd/utils/constants.py b/seacrowd/utils/constants.py index fc7fc5aeb..54efa716b 100644 --- a/seacrowd/utils/constants.py +++ b/seacrowd/utils/constants.py @@ -10,6 +10,7 @@ pairs_features_score, pairs_multi_features, qa_features, + chat_features, image_features, image_multi_features, imqa_features, @@ -106,6 +107,7 @@ class Tasks(Enum): # Multi Text Generation DIALOGUE_SYSTEM = "DS" E2E_TASK_ORIENTED_DIALOGUE = "TOD" + MULTI_TURN_CONVERSATION = "MTC" # Self Supervised & Unsupervised Text PROMPTING = "PRT" @@ -247,6 +249,7 @@ class Licenses(Enum): Tasks.TOKEN_LEVEL_LANGUAGE_IDENTIFICATION: "SEQ_LABEL", Tasks.COMMONSENSE_REASONING: "QA", Tasks.QUESTION_ANSWERING: "QA", + Tasks.MULTI_TURN_CONVERSATION: "CHAT", Tasks.QUESTION_ANSWERING_RETRIEVAL: "QA", Tasks.CONCEPT_ALIGNMENT_CLASSIFICATION: "PAIRS", Tasks.NEXT_SENTENCE_PREDICTION: "PAIRS", @@ -315,6 +318,7 @@ class Licenses(Enum): "KB": kb_features, "TREE": tree_features, "QA": qa_features, + "CHAT": chat_features, "T2T": text2text_features, "TEXT": text_features(), "TEXT_MULTI": text_multi_features(), diff --git a/seacrowd/utils/schemas/__init__.py b/seacrowd/utils/schemas/__init__.py index ec4c035f8..5a30ac568 100644 --- a/seacrowd/utils/schemas/__init__.py +++ b/seacrowd/utils/schemas/__init__.py @@ -5,6 +5,7 @@ from .pairs import features_with_continuous_label as pairs_features_score from .pairs_multilabel import features as pairs_multi_features 
from .qa import features as qa_features +from .chat import features as chat_features from .image import features as image_features from .image import multi_features as image_multi_features from .imqa import features as imqa_features @@ -28,6 +29,7 @@ "pairs_features_score", "pairs_multi_features", "qa_features", + "chat_features", "image_features", "image_multi_features", "imqa_features", diff --git a/seacrowd/utils/schemas/chat.py b/seacrowd/utils/schemas/chat.py new file mode 100644 index 000000000..6443bb649 --- /dev/null +++ b/seacrowd/utils/schemas/chat.py @@ -0,0 +1,24 @@ +""" +Conversational Chat Schema +""" +import datasets + +features = datasets.Features( + { + "id": datasets.Value("string"), + "input": datasets.Sequence({ + "role": datasets.Value("string"), + "content": datasets.Value("string"), + }), + "output": datasets.Value("string"), + + # the schema of 'meta' isn't specified, to allow some flexibility + "meta": {} + + # notes on how to use the 'meta' field + # you can choose one of two options: + # 1. define it as an empty dict if you don't think it's usable in `_generate_examples`, or + # 2. define meta as a dict keyed by the intended meta column names, with values given as `datasets.Features` classes + # in the `_info` Dataloader method, then populate it with the values in the `_generate_examples` Dataloader method + } +)