-
Notifications
You must be signed in to change notification settings - Fork 4
/
datasets.txt
423 lines (306 loc) · 9.9 KB
/
datasets.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
laion/OIG
tatsu-lab/alpaca
fka/awesome-chatgpt-prompts
Anthropic/hh-rlhf
sahil2801/CodeAlpaca-20k
JosephusCheung/GuanacoDataset
nomic-ai/gpt4all_prompt_generations
https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM
# openai/summarize_from_feedback
# openai/webgpt_comparisons
# Simontwice/premise_selection_in_isabelle
hoskinson-center/proofnet
ehartford/oa_leet10k
ehartford/leet10k-alpaca
QingyiSi/Alpaca-CoT
https://github.com/Nan-Do/LeetCodeContestsDataset
Muennighoff/flan
BAAI/COIG
# JavaFXpert/gpt-math-techniques
gsm8k
MU-NLPC/Calc-aqua_rat
MU-NLPC/Calc-gsm8k
reasoning-machines/gsm-hard
https://github.com/reasoning-machines/pal.git
anon8231489123/ShareGPT_Vicuna_unfiltered
# openai_humaneval
https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz
Muennighoff/flan
HuggingFaceM4/COCO
https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
http://images.cocodataset.org/zips/train2014.zip
https://github.com/ymcui/Chinese-LLaMA-Alpaca/raw/main/data/alpaca_data_zh_51k.json
akoksal/LongForm
liuhaotian/LLaVA-Instruct-150K
liuhaotian/LLaVA-CC3M-Pretrain-595K
fnlp/moss-002-sft-data
databricks/databricks-dolly-15k
# https://github.com/anthropics/evals
Anthropic/model-written-evals
https://github.com/microsoft/AGIEval
https://github.com/EleutherAI/lm-evaluation-harness
https://github.com/thisserand/alpaca-lora-finetune-language
https://github.com/loubnabnl/bloom-code-evaluation
yahma/alpaca-cleaned
RyokoAI/ShareGPT52K
YeungNLP/firefly-train-1.1M
BelleGroup/train_2M_CN
BelleGroup/multiturn_chat_0.8M
BelleGroup/school_math_0.25M
BelleGroup/generated_chat_0.4M
https://github.com/yizhongw/self-instruct
https://github.com/lamini-ai/lamini
https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines
https://github.com/thu-coai/Safety-Prompts
https://github.com/csitfun/LogiQA2.0
https://github.com/nlpdata/c3
https://github.com/nlpdata/dream
https://github.com/terryyz/llm-code-eval
ehartford/WizardLM_alpaca_evol_instruct_70k_unfiltered
nomic-ai/gpt4all-j-prompt-generations
bigcode/ta-prompt
MBZUAI/LaMini-instruction
OpenAssistant/oasst1
junelee/wizard_vicuna_70k
ehartford/wizard_vicuna_70k_unfiltered
hoskinson-center/minif2f-lean4
mosaicml/dolly_hhrlhf
MBZUAI/Bactrian-X
teknium/GPT4-LLM-Cleaned
spdenisov/tatoeba
spdenisov/udt_alpaca
spdenisov/enwiktionary
0x22almostEvil/multilingual-wikihow-qa-16k
0x22almostEvil/reasoning-gsm-qna-oa
0x22almostEvil/reasoning_bg_oa
0x22almostEvil/tatoeba-mt-qna-oa
clips/20Q
allenai/soda
victor123/evol_instruct_70k
cryscan/multilingual-share
stanfordnlp/SHP
wangrui6/Zhihu-KOL
liyucheng/zhihu_rlhf_3k
liyucheng/zhihu_26k
PKU-Alignment/PKU-SafeRLHF-10K
allenai/prosocial-dialog
roneneldan/TinyStories
sambanovasystems/xOA22
sambanovasystems/x-self-instruct-seed-32
IlyaGusev/gpt_roleplay_realm
iamketan25/roleplay-instructions-dataset
huggingface-tools/default-endpoints
ashiyakatuka11/empathetic_dialogues_context
Abirate/english_quotes
openaccess-ai-collective/oasst1-guanaco-extended
winddude/reddit_finance_43_250k
https://github.com/lupantech/PromptPG
openllmplayground/pandagpt_visual_instruction_dataset
timdettmers/openassistant-guanaco
ehartford/samantha-data
ehartford/based
renumics/cifar100-enriched
kaiokendev/SuperCOT-dataset
tiedong/goat
nomic-ai/gpt4all_prompt_generations_with_p3
P1ayer-1/chatgpt-conversations-chatlogs.net
achang/plot_qa
yankscally/midiset
sileod/mindgames
spdenisov/wsd_semcor
code_x_glue_ct_code_to_text
GaussianMixture/oasst_alpaca_sharegpt_dataset
shibing624/medical
BelleGroup/train_3.5M_CN
shibing624/alpaca-zh
Chinese-Vicuna/guanaco_belle_merge_v1.0
FreedomIntelligence/HuatuoGPT-sft-data-v1
philschmid/sharegpt-raw
Hello-SimpleAI/HC3
teknium/GPTeacher-General-Instruct
metaeval/ScienceQA_text_only
hellaswag
riddle_sense
camel-ai/math
camel-ai/biology
camel-ai/physics
camel-ai/chemistry
winglian/evals
ewof/code-alpaca-instruct-unfiltered
ewof/code-alpaca-instruct-unfiltered
64bits/lex_fridman_podcast_for_llm_vicuna
https://github.com/GJBroughton/Star_Trek_Scripts
https://github.com/shibing624/MedicalGPT
tasksource/oasst1_pairwise_rlhf_reward
Dahoas/full-hh-rlhf
Dahoas/static-hh
Dahoas/rm-static
liswei/rm-static-zhTW
yitingxie/rlhf-reward-datasets
# flan/v2/*data
https://github.com/google-research/FLAN
conceptofmind/flan2021_submix_original
conceptofmind/t0_submix_original
conceptofmind/niv2_submix_original
conceptofmind/cot_submix_original
conceptofmind/dialog_submix_original
https://github.com/google-research-datasets/Taskmaster
https://github.com/OFA-Sys/ExpertLLaMA
https://github.com/ziliwangnlp/RefGPT
Mutonix/RefGPT-Fact
Mutonix/RefGPT-Code-ds
Mutonix/RefGPT-Code-cr
Mutonix/RefGPT-Code-bg
PocketDoc/Alpaca_Evol_Instruct_Cleaned
GAIR/lima
PocketDoc/DansPileOfSets
WizardLM/WizardLM_evol_instruct_V2_196k
Alignment-Lab-AI/AILabAssistant
tasksource/tasksource-instruct-v0
tasksource/zero-shot-label-nli
tasksource/icl-symbol-tuning-instruct
vietgpt/OIG_mathqa_flanv2_en
neural_code_search
neulab/conala
reshinthadith/synthetic_program_synthesis_python_1M
fiveflow/cot_ranking
squad_adversarial
bigscience-data/roots_zh-cn_wikipedia
knkarthick/dialogsum
jondurbin/rosettacode-raw
winddude/IHOPv01
jondurbin/rosettacode-10
Norquinal/claude_multi_instruct_1k
Open-Orca/OpenOrca
ehartford/dolphin
CheshireAI/guanaco-unchained
Salesforce/dialogstudio
theblackcat102/evol-codealpaca-v1
nickrosh/Evol-Instruct-Code-80k-v1
P1ayer-1/books-3
P1ayer-1/college_textbooks
P1ayer-1/books-3-textbooks
P1ayer-1/crash_course_subs
P1ayer-1/stack-exchange-preferences-code-v2
goendalf666/sql-chat-instructions
wenhu/TheoremQA
lukaemon/bbh
dmayhem93/agieval-lsat-ar
lmsys/chatbot_arena_conversations
shahules786/orca-chat
CarperAI/openai_summarize_comparisons
declare-lab/InstructEvalImpact
nampdn-ai/tiny-codes
declare-lab/flan-mini
allenai/peS2o
LinkSoul/instruction_merge_set
jondurbin/airoboros-gpt4-m2.0
rombodawg/MegaCodeTraining200k
causalnlp/corr2cause
ehartford/WizardLM_evol_instruct_V2_196k_unfiltered_merged_split
ehartford/open-instruct-uncensored
stingning/ultrachat
# 14 GB
ArmelR/stack-exchange-instruction
https://github.com/HKUNLP/DS-1000
https://github.com/project-baize/baize-chatbot
https://github.com/leanprover/lean4-samples
https://github.com/microsoft/promptbench
https://github.com/llm-attacks/llm-attacks
https://github.com/lz1oceani/verify_cot
https://github.com/Lichang-Chen/InstructZero
https://github.com/salesforce/factualNLG
# huge extra downloads, TODO subreddit
https://github.com/CornellNLP/ConvoKit
# huge extra downloads and mostly dup
https://github.com/allenai/open-instruct
# has extra download
https://github.com/wellecks/naturalproofs
https://github.com/brightmart/nlp_chinese_corpus
https://github.com/FranxYao/chain-of-thought-hub
https://github.com/Troyanovsky/Local-LLM-comparison
# https://people.eecs.berkeley.edu/~hendrycks/MATH.tar
# https://drive.google.com/open?id=1hQsua3TkpEmcJD_UWQx8dmNdEZPyxw23&authuser=0
https://github.com/hendrycks/math/
# https://s3.amazonaws.com/datasets.huggingface.co/scientific_papers/1.1.1/arxiv-dataset.zip
scientific_papers
# for now:
# https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/xml/en.zip
open_subtitles
# also https://cloud.tsinghua.edu.cn/d/a5a16f2381e7439eb475/
https://github.com/thu-coai/LOT-LongLM
https://github.com/StevenGrove/GPT4Tools
https://github.com/HKUNLP/UnifiedSKG
https://github.com/RUCAIBox/StructGPT
# download_raw_data.sh + download_preprocessed_data.sh
https://github.com/michiyasunaga/DrRepair
# https://cloud.google.com/sdk/docs/install-sdk#linux
# gsutil -m cp -r gs://dm-code_contests local_location
https://github.com/deepmind/code_contests
# use gdown to download the dataset (~4G) and notebooks (~8G)
https://github.com/rajasagashe/JuICe
# https://github.com/InternLM/opencompass/releases/download/0.1.1/OpenCompassData.zip
https://github.com/InternLM/opencompass
https://github.com/lean-dojo/ReProver
# Lean 3: https://zenodo.org/record/8016386 by https://github.com/lean-dojo/LeanDojo/blob/main/scripts/generate-benchmark-lean3.ipynb
# Lean 4: https://zenodo.org/record/8040110 by https://github.com/lean-dojo/LeanDojo/blob/main/scripts/generate-benchmark-lean4.ipynb
https://github.com/lean-dojo/LeanDojo
# should have been filtered
Helsinki-NLP/tatoeba_mt
# partial download
allenai/objaverse
# huge 'data/*'
tiiuae/falcon-refinedweb
# TODO: partial download
NTU-NLP-sg/xCodeEval
# huge download
# '*'
RyokoAI/CNNovel125K
RyokoAI/Fandom23K
RyokoAI/ScribbleHub17K
RyokoAI/Honeyfeed3600
# '*'
roneneldan/TinyStories
# selective download
# 'data/agda/*' 'data/coq/*' 'data/c2hs-haskell/*' 'data/f-sharp/*' 'data/idris/*' 'data/isabelle/*' 'data/julia/*' 'data/kotlin/*' 'data/lean/*' 'data/literate-agda/*' 'data/literate-haskell/*' 'data/markdown/*' 'data/mathematica/*' 'data/prolog/*' 'data/restructuredtext/*' 'data/rust/*' 'data/sage/*' 'data/tex/*'
bigcode/the-stack
# wiki.jsonl book.jsonl filtered_08cdfa755e6d4d89b673d5bd1acee5f6.sampled.jsonl arxiv_*.jsonl
togethercomputer/RedPajama-Data-1T
# 'preprocessed/adult/*' 'preprocessed/chain_of_thought/*' 'preprocessed/conversation/*' 'preprocessed/instruct/*' 'preprocessed/knowledge/*' 'preprocessed/rlhf/*' 'preprocessed/summarisation/*' 'preprocessed/system/*'
m8than/normalised_chatml_rwkvready
#
tiiuae/falcon-refinedweb
# TODO: selective download
# 'en/*' 'zh/*'
bigscience/xP3
# TODO: need extra download
allenai/lila
https://github.com/fighting41love/funNLP
https://github.com/X-PLUG/mPLUG-Owl
# TODO: too big, too foundational, need to filter and dedup
# ~3G
# SirNeural/flan_v2
# ~85G
# conceptofmind/flan_dialog_submix
# 7GB
# ccdv/arxiv-summarization
# 311GB
# bigcode/starcoderdata
# ~90GB
# MMInstruction/M3IT
# 204GB
# MMInstruction/M3IT-80
# https://huggingface.co/datasets/allenai/c4/tree/mC4_3.1.0/multilingual
# allenai/c4
# allenai/nllb
# oscar-corpus/OSCAR-2201
# sil-ai/bloom-lm
# EleutherAI/the_pile
# EleutherAI/the_pile_deduplicated
# bigscience/bloomz
# bigscience/evaluation-results
# c4
# wikipedia
# the_pile_books3
# pg19
# TODO