from pathlib import Path
from typing import List, Dict, Union
from dataclasses import dataclass
from abc import ABC

from transformers import PretrainedConfig, GPT2Config, BertConfig, TrainingArguments, Seq2SeqTrainingArguments, \
    DataCollator, GenerationConfig
import miditok

from dataset import DatasetABC
import tokenizers_


@dataclass
class DataConfig:
    valid_ratio: float
    test_ratio: float
    min_seq_len: int  # used for training
    max_seq_len: int


@dataclass
class TestingConfig:
    batch_size: int
    min_seq_len: int
    max_seq_len: int


@dataclass
class TokenizationConfig:
    tokenization: str
    tokenizer_params: Dict
    bpe_vocab_size: int = None
@dataclass
class Baseline(ABC):
    """Represents a baseline.

    Needs to be subclassed to implement the create_dataset method.
    """
    name: str  # bpe or tokenization
    exp_name: str  # data_tokenization
    dataset: str
    seed: int
    tokenization_config: TokenizationConfig
    model_config: Union[PretrainedConfig, GPT2Config, BertConfig]
    training_config: TrainingArguments
    data_config: DataConfig
    test_config: TestingConfig
    generation_config: GenerationConfig = None
    embed_pooling_size: int = None  # as it cannot be stored in BertConfig / GPT2Config

    def __post_init__(self):
        tokens_path_dir_name = f"{self.dataset}_{self.tokenization}"
        if self.tokenization_config.bpe_vocab_size is not None:
            tokens_path_dir_name += f"_bpe{self.tokenization_config.bpe_vocab_size}"
        self.tokens_path = Path("data", tokens_path_dir_name)
        self.tokenizer = self.create_tokenizer()  # created with the method below, called by the Experiment class
        self.training_config.output_dir = str(self.run_path)  # override output dir
        self.training_config.logging_dir = self.training_config.output_dir  # for tensorboard
        self.model_config.vocab_size = len(self.tokenizer)
        self.model_config.pad = self.pad_token
        if isinstance(self.model_config, GPT2Config):
            self.model_config.bos_token_id = self.bos_token
            self.model_config.eos_token_id = self.eos_token
            self.generation_config.pad_token_id = self.pad_token
        if isinstance(self.training_config, Seq2SeqTrainingArguments):
            self.training_config.generation_config = self.generation_config
    def create_tokenizer(self) -> miditok.MIDITokenizer:
        # Look for the tokenization class in miditok first, and fall back to the
        # local tokenizers_ module if miditok does not provide it.
        try:
            _ = getattr(miditok, self.tokenization)
            package = miditok
        except AttributeError:
            package = tokenizers_
        # Load the tokenizer from its saved config if present, otherwise create it from the given parameters.
        try:
            tokenizer = getattr(package, self.tokenization)(params=self.tokens_path / 'config.txt')
        except FileNotFoundError:
            tokenizer = getattr(package, self.tokenization)(**self.tokenization_config.tokenizer_params)
        return tokenizer

    @property
    def tokenization(self) -> str: return self.tokenization_config.tokenization

    @property
    def run_path(self) -> Path: return Path('runs', self.exp_name, self.name)

    def __return_special_token(self, tok: str) -> int:
        if self.tokenizer.is_multi_voc:
            return self.tokenizer[0, tok]
        return self.tokenizer[tok]

    @property
    def pad_token(self) -> int:
        return self.__return_special_token('PAD_None')

    @property
    def mask_token(self) -> int:
        return self.__return_special_token('MASK_None')

    @property
    def bos_token(self) -> int:
        return self.__return_special_token('BOS_None')

    @property
    def eos_token(self) -> int:
        return self.__return_special_token('EOS_None')

    @property
    def sep_token(self) -> int:
        return self.__return_special_token('SEP_None')

    @property
    def special_tokens(self) -> List[int]:
        return [self.pad_token, self.mask_token, self.bos_token, self.eos_token, self.sep_token]

    def create_model(self):
        raise NotImplementedError

    def create_dataset(self, files_paths: List[Path], *args, **kwargs) -> DatasetABC:
        raise NotImplementedError

    def create_data_collator(self) -> DataCollator:
        raise NotImplementedError

    def __repr__(self):
        return f'{self.name} - {self.tokens_path}'
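
# ---------------------------------------------------------------------------
# Illustrative sketch (added for documentation, not part of the original
# module): how a concrete Baseline subclass is expected to fill in the
# create_* hooks. The model choice (AutoModelForCausalLM) and the class name
# are assumptions for illustration only; the subclasses actually used by the
# experiments live elsewhere in the repository.
#
#     class ExampleCausalBaseline(Baseline):
#         def create_model(self):
#             # e.g. build a causal LM directly from the stored model_config
#             from transformers import AutoModelForCausalLM
#             return AutoModelForCausalLM.from_config(self.model_config)
#
#         def create_dataset(self, files_paths: List[Path], *args, **kwargs) -> DatasetABC:
#             # would return a DatasetABC subclass built from the tokenized files
#             ...
#
#         def create_data_collator(self) -> DataCollator:
#             # would return a collator that pads batches with self.pad_token
#             ...
# ---------------------------------------------------------------------------
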
@dataclass
class Experiment:
    name: str  # dataset_tokenization
    baselines: List[Baseline]
    dataset: str

    @property
    def data_path_midi(self):
        return Path('data', self.dataset)  # original dataset path, in MIDI

    @property
    def run_path(self) -> Path: return Path('runs', self.name)

    def __str__(self): return f'{self.name} - {len(self.baselines)} baselines'

    def __repr__(self): return self.__str__()
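
# ---------------------------------------------------------------------------
# Illustrative usage sketch (hypothetical values, not taken from this repo):
# wiring a Baseline subclass into an Experiment. "maestro", the TSD
# tokenization, and the hyper-parameters below are placeholder examples, and
# ExampleCausalBaseline refers to the sketch above.
#
#     baseline = ExampleCausalBaseline(
#         name="TSD_bpe1000",
#         exp_name="maestro_TSD",
#         dataset="maestro",
#         seed=42,
#         tokenization_config=TokenizationConfig("TSD", {}, bpe_vocab_size=1000),
#         model_config=GPT2Config(),
#         training_config=TrainingArguments(output_dir="runs"),  # output_dir is overridden in __post_init__
#         data_config=DataConfig(valid_ratio=0.1, test_ratio=0.15, min_seq_len=64, max_seq_len=512),
#         test_config=TestingConfig(batch_size=16, min_seq_len=64, max_seq_len=512),
#         generation_config=GenerationConfig(),
#     )
#     experiment = Experiment(name="maestro_TSD", baselines=[baseline], dataset="maestro")
#     print(experiment)  # "maestro_TSD - 1 baselines"
# ---------------------------------------------------------------------------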