-
Notifications
You must be signed in to change notification settings - Fork 4
/
cortexsubsetloader.py
339 lines (317 loc) · 12.3 KB
/
cortexsubsetloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# The MIT License (MIT)
# Copyright © 2023 Yuma Rao
# Copyright © 2023 const
# Copyright (c) 2023 Opentensor
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the “Software”), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software,
# and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of
# the Software.
# THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
# THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
import time
import typing
import random
import sys
import wandb
import numpy as np
from tqdm import tqdm
import torch
from torch.utils.data import IterableDataset
from transformers import PreTrainedTokenizerBase
# CORTEX_WANDB_PROJECT = "cortex-t/multi-modality"
# CORTEX_MAX_UIDS = 256
# CORTEX_WANDB_TYPE = "validator"
# validator eval sequence length.
# sequence_length = 2048
# Substrings that disqualify a response: a response containing any of these
# phrases is skipped by CortexSubsetLoader when it fills its buffer.
# The list intentionally keeps its historical (partly duplicated) entries;
# duplicates are harmless because the list is only used for substring checks.
UNWANTED_PHRASES = [
    "text-based AI language model",
    "please refrain",
    "it is never okay",
    "It is important to",
    "It's important to",
    "real-world consequences",
    "responsible AI",
    "AI principles",
    "AI assistant",
    "an AI language",
    "as a language model",
    "as an AI language model",
    "As a large language model",
    "As an AI",
    "ethical principles",
    "it is not appropriate",
    "it's not appropriate",
    "I cannot fulfill your request",
    "ethical guidelines",
    "my guidelines",
    "prioritize user safety",
    "cannot provide guidance",
    "cannot provide information",
    "unable to offer assistance",
    "cannot engage in discussions",
    "programming prohibits",
    "follow ethical guidelines",
    "cannot support or promote",
    "against my programming",
    "not able to provide",
    "cannot provide any information",
    "an AI language model you don't have",
    "As an AI language model, I cannot",
    "As an AI language model, I do not",
    "As an AI language model, I am not able",
    "As an AI language model, I don't have personal",
    "I am an AI language model and do not",
    "However, it is important to use any code or information provided responsibly and within legal and ethical boundaries.",
    "As an AI language model, I don't have",
    "As an AI language model, I am only able",
    "AI language model and I do not",
    "As an AI language model, I cannot modify",
    "As an AI language model, I do not",
    "I know as an AI language model you don't have",
    "as an AI language model, you cannot",
    "I'm sorry, but as an AI language model",
    "As an AI language model, I don't have",
    "Unfortunately, I cannot provide",
    "I'm sorry, I cannot",
    "I'm sorry, I cannot generate",
    "AI cannot create or program",
    "I'm afraid I cannot create",
    "I cannot assist",
    "I'm sorry,",
    "I'm an AI",
    "I am an AI",
    "my purpose",
    "entertainment purposes",
    "purely hypothetical",
    "not a human",
    "I am an AI",
    "cannot provide",
    "can't provide",
    "won't provide",
    "not provide",
    "a language model",
    "As a machine",
    "I don't have the ability",
    "I am here to assist",
    "my purpose is to ",
    "my knowledge cutoff",
    "my knowledge cut off",
    "September 2021",
    "I apologize, but",
    "It is not possible",
    "Please note",
    "not acceptable",
    "*This chat conversation is shared from",
    "*This conversation is shared from",
    "<|endoftext|>",
    "Я разработчик",
    "I'm sorry, I cannot",
    "breach of",
    "privacy policy",
    "I am programmed to",
    "As a helpful assistant",
    "I don't have beliefs",
    "I don't have personal",
    "I don't have a personal",
    "I don't have emotions",
    "I don't have the ability to feel",
    "I don't have a physical",
    "I don't have physical",
    "I don't have the ability to remember",
    "I don't have access to real-time",
    "I don't have sensors or a physical body",
    "I don't have sensory input",
    "I don't have a sense",
    "I don't have the capability to perceive",
    "I don't have the capability to feel",
    "I am an artificial intelligence",
    "I don't have access to real-time",
    "I don't have beliefs or disagreements",
    "I do not have a sense of",
    "I do not have beliefs",
    "I do not have personal",
    "I do not have a personal",
    "I do not have emotions",
    "I do not have the ability to feel",
    "I do not have a physical",
    "I do not have physical",
    "I do not have the ability to remember",
    "I do not have access to real-time",
    "I do not have sensors or a physical body",
    "I do not have sensory input",
    "I do not have a sense",
    "I do not have the capability to perceive",
    "I do not have the capability to feel",
    "I am an artificial intelligence",
    "I do not have access to real-time",
    "I do not have beliefs or disagreements",
    "I do not have a sense of",
    "September 2021",
    "as a language model",
    "ethical guidelines",
    "as an AI language model",
    "my guidelines",
    "As an AI",
    "cannot provide guidance",
    "cannot provide information",
    "unable to offer assistance",
    "cannot engage in discussions",
    "programming prohibits",
    "cannot support or promote",
    "activities that could harm",
    "against my programming",
    "activities that could undermine",
    "not within the scope",
    "designed to prioritize safety",
    "not able to provide",
    "maintain user safety",
    "adhere to safety guidelines",
    "dangerous or harmful",
    "cannot provide any information",
    "focus on promoting safety",
    "maintain user safety",
    "focus on promoting safety",
    "it is never okay",
    "September 2021",
    "as a language model",
    "ethical guidelines",
    "as an AI language model",
    "my guidelines",
    "As an AI",
    "prioritize user safety",
    "adhere to ethical guidelines",
    "promote safety",
    "responsible information sharing",
    "jeopardize the safety",
    "safe information",
    "cannot provide guidance",
    "cannot provide information",
    "unable to offer assistance",
    "cannot engage in discussions",
    "programming prohibits",
    "prioritize safety",
    "cannot support or promote",
    "activities that could harm",
    "against my programming",
    "potentially dangerous",
    "not within the scope",
    "not able to provide",
    "cannot provide any information",
    # The next two entries previously lacked trailing commas, which made Python
    # concatenate them with "gpt" into a single meaningless phrase.
    "I don't have beliefs",
    "I don't have personal",
    "gpt",
    "gpT",
    "gPt",
    "Gpt",
    "gPT",
    "GpT",
    "GPt",
    "GPT",
    "gpt",
]
class CortexSubsetLoader(IterableDataset):
    """Iterable dataset of (prompt, response) string pairs read from wandb run history.

    On construction the loader queries the wandb API for runs of
    ``cortex_project`` whose ``config.type`` equals ``cortex_type``, scans a
    window of history rows per run and stores every accepted
    ``(prompt, response)`` pair in ``self.buffer``.  Iterating the loader
    iterates that buffer; ``tokenize`` turns pairs into token-id tensors.
    """

    # Number of ``prompts.{uid}`` / ``responses.{uid}`` keys probed per history row.
    MAX_UIDS = 256

    def __init__(self, latest=True, random_seed: typing.Optional[int] = None,
                 max_samples=1000, steps: typing.Optional[int]=1, progress=False, retry_delay=60,
                 retry_limit=10, page_size=100, running: typing.Optional[bool]=False,
                 cortex_project="cortex-t/multi-modality",
                 cortex_type="validator", silent=False,
                 ignore_list: typing.Optional[typing.List[str]] = None, dedup=True):
        """Fetch up to ``max_samples`` pairs, retrying the whole fetch on failure.

        Args:
            latest: If true, the scan window of each run ends at the run's
                ``lastHistoryStep``; otherwise it ends at a random step (or at
                step 0 when no random generator is active).
            random_seed: Seed for the run-order shuffle and random step choice.
                ``None`` picks a random seed.
            max_samples: Collection stops as soon as the buffer holds exactly
                this many pairs.
            steps: Number of history steps in each scan window; ``None`` scans
                from step 0.
            progress: Show a tqdm progress bar over runs.
            retry_delay: Seconds to sleep between failed attempts.
            retry_limit: Maximum number of attempts before the last error is
                re-raised.
            page_size: Forwarded to ``run.scan_history``.
            running: If truthy, only runs in state ``"running"`` are queried.
            cortex_project: wandb project path passed to ``api.runs``.
            cortex_type: Required value of the run's ``config.type``.
            silent: Suppress the "Did not collect" message.
            ignore_list: Responses to skip.  When ``dedup`` is true, accepted
                responses are appended to this list in place, so a caller can
                share one list between loaders.  Defaults to a fresh list per
                instance.
            dedup: Skip responses already accepted (see ``ignore_list``).

        Raises:
            Exception: whatever the final failed attempt raised, once
                ``retry_limit`` attempts are exhausted.
        """
        super().__init__()
        # A fresh list per instance: a shared mutable default would leak
        # dedup state between unrelated loaders.
        if ignore_list is None:
            ignore_list = []

        api = wandb.Api(timeout=100)
        if random_seed is None:
            random_seed = random.randint(0, sys.maxsize)

        filters = [{"config.type": cortex_type}]
        if running:
            filters.append({"state": "running"})
        runs = api.runs(cortex_project, filters={"$and": filters})

        # NOTE(review): a seed of exactly 0 is falsy, so it disables shuffling
        # and random step selection. Kept as-is for backward compatibility.
        generator = np.random.default_rng(seed=random_seed) if random_seed else None

        # Defined up front so the attributes exist even if no attempt runs.
        self.buffer: typing.List[typing.Tuple[str, str]] = []
        self.selected_runs: typing.List[int] = []

        attempt = 0
        while attempt < retry_limit:
            ignored_before = len(ignore_list)
            try:
                filled = self._fetch(
                    runs, generator, latest, steps, page_size, progress,
                    max_samples, ignore_list, dedup,
                )
                if not filled and not silent:
                    print(f"Did not collect {max_samples}, only got {len(self.buffer)}")
                return
            except Exception:
                # Undo dedup entries added by the failed attempt; otherwise the
                # retry would reject every response it had already seen.
                for _ in range(len(ignore_list) - ignored_before):
                    ignore_list.pop()
                attempt += 1
                print(f"Failed to fetch data, retrying. Attempt {attempt}/{retry_limit}")
                if attempt < retry_limit:
                    time.sleep(retry_delay)  # Wait before the next retry
                else:
                    print("Maximum retry limit reached. Unable to fetch data.")
                    raise

    def _fetch(self, runs, generator, latest, steps, page_size, progress,
               max_samples, ignore_list, dedup) -> bool:
        """Run one collection pass over ``runs``; return True once the buffer is full."""
        run_order = list(range(len(runs)))
        if generator is not None:
            generator.shuffle(run_order)

        # Every attempt starts from a clean slate.
        self.buffer = []
        self.selected_runs = []

        for run_index in tqdm(run_order, desc="Run", leave=False, disable=not progress):
            run = runs[run_index]
            self.selected_runs.append(run_index)

            if latest:
                last_step: int = run.lastHistoryStep
            elif generator is not None:
                last_step = int(generator.random() * run.lastHistoryStep)
            else:
                last_step = 0

            # Scan window is [min_step, max_step), i.e. the last `steps` steps
            # up to and including `last_step`.
            max_step = last_step + 1
            min_step = max(0, max_step - steps) if steps is not None else 0
            history_scan = run.scan_history(min_step=min_step, max_step=max_step, page_size=page_size)

            for sample in history_scan:
                if self._take_pairs(sample, max_samples, ignore_list, dedup):
                    return True
        return False

    def _take_pairs(self, sample, max_samples, ignore_list, dedup) -> bool:
        """Append the accepted pairs of one history row; return True once the buffer is full."""
        for uid in range(self.MAX_UIDS):
            try:
                prompt: typing.Optional[str] = sample[f"prompts.{uid}"]
                response: typing.Optional[str] = sample[f"responses.{uid}"]
            except KeyError:
                # This row carries no entry for this uid.
                continue
            if not (isinstance(prompt, str) and isinstance(response, str)):
                continue
            prompt = prompt.strip()
            response = response.strip()
            if len(prompt) == 0 or len(response) == 0:
                continue
            if any(x in response for x in UNWANTED_PHRASES):
                continue
            if response in ignore_list:
                continue
            self.buffer.append((prompt, response))
            if dedup:
                ignore_list.append(response)
            if len(self.buffer) == max_samples:
                return True
        return False

    def tokenize(self, tokenizer: "PreTrainedTokenizerBase", ext_data=None, max_length: int = 2048):
        """Tokenize (prompt, response) pairs with the tokenizer's chat template.

        Args:
            tokenizer: Object providing ``apply_chat_template``.
            ext_data: Optional sequence of pairs to tokenize instead of
                ``self.buffer``.  An empty sequence yields an empty result.
            max_length: Truncation length forwarded to ``apply_chat_template``.

        Returns:
            A list with one ``(ids, prompt_len)`` tuple per pair: ``ids`` is the
            tokenized full conversation stacked into a tensor with a leading
            dimension of 1, and ``prompt_len`` is the number of ids produced
            for the user turn alone (with the generation prompt added).
        """
        if ext_data is None:
            data = self.buffer
        else:
            if len(ext_data) > 0 and type(ext_data[0]) is dict:
                # NOTE(review): `.items()` yields (key, value) tuples, not the
                # prompt/response strings themselves; confirm the dict schema
                # callers pass here.
                ext_data = [x.items() for x in ext_data]
            data = ext_data

        batches = []
        for prompt, response in data:
            conversation = [
                {"role": "user", "content": prompt},
                {"role": "assistant", "content": response},
            ]
            prompt_ids = tokenizer.apply_chat_template(
                [conversation[0]], truncation=True, max_length=max_length,
                add_generation_prompt=True,
            )
            ids = tokenizer.apply_chat_template(
                conversation, truncation=True, max_length=max_length,
            )
            batches.append((torch.stack([torch.tensor(ids)]), len(prompt_ids)))
        return batches

    def __iter__(self):
        """Iterate over the buffered (prompt, response) pairs."""
        return iter(self.buffer)