-
Notifications
You must be signed in to change notification settings - Fork 178
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* feature: add max_samples to limit mixed datasets
- Loading branch information
Showing
5 changed files
with
139 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,6 @@ | ||
{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} | ||
{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} | ||
{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} | ||
{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} | ||
{"text": "Today is Sunday and it's a happy day!", "meta": {"src": "Arxiv", "date": "2023-04-27", "version": "1.0"}} | ||
{"text": "Do you need a cup of coffee?", "meta": {"src": "code", "author": "xxx"}} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
import os | ||
import unittest | ||
|
||
from data_juicer.format.mixture_formatter import MixtureFormatter | ||
|
||
|
||
class MixtureFormatterTest(unittest.TestCase):
    """Tests for ``MixtureFormatter`` with weights and ``max_samples``.

    Every case loads the same demo JSONL file one or two times. The
    expected lengths below follow from ``test_only_file``, which pins the
    unweighted file at 6 samples.
    """

    def setUp(self):
        """Resolve the demo dataset path relative to this test module."""
        self._path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                  'data', 'structured')
        self._file = os.path.join(self._path, 'demo-dataset.jsonl')
        # The second "dataset" is deliberately the same file, so mixture
        # sizes are simple multiples of the single-file size.
        self._file2 = self._file

    def _assert_mixture(self, data_path, expected_len, **formatter_kwargs):
        """Load ``data_path`` and check the dataset size and columns.

        :param data_path: mixture spec passed to ``MixtureFormatter``
            (paths separated by spaces, each optionally preceded by a
            weight).
        :param expected_len: number of samples the loaded dataset must
            contain.
        :param formatter_kwargs: extra keyword arguments forwarded
            unchanged to ``MixtureFormatter`` (e.g. ``max_samples``).
            They are only passed when given, so the formatter's own
            defaults apply otherwise.
        """
        formatter = MixtureFormatter(data_path, **formatter_kwargs)
        ds = formatter.load_dataset()
        self.assertEqual(len(ds), expected_len)
        self.assertEqual(list(ds.features.keys()), ['text', 'meta'])

    def test_only_file(self):
        """A bare path loads the whole file."""
        self._assert_mixture(self._file, 6)

    def test_sample_weight(self):
        """A 0.5 weight keeps half of the file."""
        self._assert_mixture('0.5 ' + self._file, 3)

    def test_sample_number(self):
        """``max_samples`` alone caps the dataset size."""
        max_samples = 2
        self._assert_mixture(self._file, max_samples,
                             max_samples=max_samples)

    def test_sample_number_weight(self):
        """``max_samples`` determines the size even when a weight is set."""
        max_samples = 2
        self._assert_mixture('0.5 ' + self._file, max_samples,
                             max_samples=max_samples)

    def test_multi_datasets_without_weight(self):
        """Two unweighted files are concatenated in full."""
        data_path = self._file + ' ' + self._file2
        self._assert_mixture(data_path, 12)

    def test_multi_datasets_with_one_weight(self):
        """Only the first file is weighted; the second is loaded in full."""
        data_path = '0.5 ' + self._file + ' ' + self._file2
        self._assert_mixture(data_path, 9)

    def test_multi_datasets_with_weight(self):
        """Both files carry a 0.5 weight."""
        data_path = '0.5 ' + self._file + ' 0.5 ' + self._file2
        self._assert_mixture(data_path, 6)

    def test_multi_datasets_with_sample(self):
        """``max_samples`` sets the total size of a weighted mixture."""
        max_samples = 7
        data_path = '0.5 ' + self._file + ' 0.5 ' + self._file2
        self._assert_mixture(data_path, max_samples,
                             max_samples=max_samples)
|
||
if __name__ == '__main__': | ||
unittest.main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters