datasets.txt

laion/OIG

tatsu-lab/alpaca

fka/awesome-chatgpt-prompts

Anthropic/hh-rlhf

sahil2801/CodeAlpaca-20k

JosephusCheung/GuanacoDataset

nomic-ai/gpt4all_prompt_generations

https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM

# openai/summarize_from_feedback
# openai/webgpt_comparisons

# Simontwice/premise_selection_in_isabelle
hoskinson-center/proofnet

ehartford/oa_leet10k
ehartford/leet10k-alpaca

QingyiSi/Alpaca-CoT

https://github.com/Nan-Do/LeetCodeContestsDataset

Muennighoff/flan

BAAI/COIG

# JavaFXpert/gpt-math-techniques
gsm8k
MU-NLPC/Calc-aqua_rat
MU-NLPC/Calc-gsm8k
reasoning-machines/gsm-hard
https://github.com/reasoning-machines/pal.git

anon8231489123/ShareGPT_Vicuna_unfiltered

# openai_humaneval
https://raw.githubusercontent.com/openai/human-eval/master/data/HumanEval.jsonl.gz

Muennighoff/flan

HuggingFaceM4/COCO
https://cs.stanford.edu/people/karpathy/deepimagesent/caption_datasets.zip
http://images.cocodataset.org/zips/train2014.zip
https://github.com/ymcui/Chinese-LLaMA-Alpaca/raw/main/data/alpaca_data_zh_51k.json

akoksal/LongForm

liuhaotian/LLaVA-Instruct-150K
liuhaotian/LLaVA-CC3M-Pretrain-595K

fnlp/moss-002-sft-data

databricks/databricks-dolly-15k

# https://github.com/anthropics/evals
Anthropic/model-written-evals

https://github.com/microsoft/AGIEval

https://github.com/EleutherAI/lm-evaluation-harness

https://github.com/thisserand/alpaca-lora-finetune-language

https://github.com/loubnabnl/bloom-code-evaluation

yahma/alpaca-cleaned
RyokoAI/ShareGPT52K

YeungNLP/firefly-train-1.1M

BelleGroup/train_2M_CN
BelleGroup/multiturn_chat_0.8M
BelleGroup/school_math_0.25M
BelleGroup/generated_chat_0.4M

https://github.com/yizhongw/self-instruct
https://github.com/lamini-ai/lamini
https://github.com/nelson-liu/evaluating-verifiability-in-generative-search-engines

https://github.com/thu-coai/Safety-Prompts

https://github.com/csitfun/LogiQA2.0

https://github.com/nlpdata/c3
https://github.com/nlpdata/dream

https://github.com/terryyz/llm-code-eval

ehartford/WizardLM_alpaca_evol_instruct_70k_unfiltered
nomic-ai/gpt4all-j-prompt-generations
bigcode/ta-prompt
MBZUAI/LaMini-instruction
OpenAssistant/oasst1

junelee/wizard_vicuna_70k
ehartford/wizard_vicuna_70k_unfiltered
hoskinson-center/minif2f-lean4

mosaicml/dolly_hhrlhf
MBZUAI/Bactrian-X
teknium/GPT4-LLM-Cleaned
spdenisov/tatoeba
spdenisov/udt_alpaca
spdenisov/enwiktionary
0x22almostEvil/multilingual-wikihow-qa-16k
0x22almostEvil/reasoning-gsm-qna-oa
0x22almostEvil/reasoning_bg_oa
0x22almostEvil/tatoeba-mt-qna-oa
clips/20Q
allenai/soda
victor123/evol_instruct_70k
cryscan/multilingual-share
stanfordnlp/SHP

wangrui6/Zhihu-KOL
liyucheng/zhihu_rlhf_3k
liyucheng/zhihu_26k

PKU-Alignment/PKU-SafeRLHF-10K

allenai/prosocial-dialog
roneneldan/TinyStories
sambanovasystems/xOA22
sambanovasystems/x-self-instruct-seed-32
IlyaGusev/gpt_roleplay_realm
iamketan25/roleplay-instructions-dataset
huggingface-tools/default-endpoints
ashiyakatuka11/empathetic_dialogues_context

Abirate/english_quotes
openaccess-ai-collective/oasst1-guanaco-extended
winddude/reddit_finance_43_250k

https://github.com/lupantech/PromptPG
openllmplayground/pandagpt_visual_instruction_dataset

timdettmers/openassistant-guanaco
ehartford/samantha-data
ehartford/based
renumics/cifar100-enriched
kaiokendev/SuperCOT-dataset
tiedong/goat
nomic-ai/gpt4all_prompt_generations_with_p3
P1ayer-1/chatgpt-conversations-chatlogs.net
achang/plot_qa
yankscally/midiset
sileod/mindgames
spdenisov/wsd_semcor
code_x_glue_ct_code_to_text
GaussianMixture/oasst_alpaca_sharegpt_dataset
shibing624/medical
BelleGroup/train_3.5M_CN
shibing624/alpaca-zh
Chinese-Vicuna/guanaco_belle_merge_v1.0
FreedomIntelligence/HuatuoGPT-sft-data-v1
philschmid/sharegpt-raw
Hello-SimpleAI/HC3
teknium/GPTeacher-General-Instruct
metaeval/ScienceQA_text_only
hellaswag
riddle_sense
camel-ai/math
camel-ai/biology
camel-ai/physics
camel-ai/chemistry
winglian/evals
ewof/code-alpaca-instruct-unfiltered

ewof/code-alpaca-instruct-unfiltered
64bits/lex_fridman_podcast_for_llm_vicuna

https://github.com/GJBroughton/Star_Trek_Scripts

https://github.com/shibing624/MedicalGPT
tasksource/oasst1_pairwise_rlhf_reward
Dahoas/full-hh-rlhf
Dahoas/static-hh
Dahoas/rm-static
liswei/rm-static-zhTW
yitingxie/rlhf-reward-datasets

# flan/v2/*data
https://github.com/google-research/FLAN
conceptofmind/flan2021_submix_original
conceptofmind/t0_submix_original
conceptofmind/niv2_submix_original
conceptofmind/cot_submix_original
conceptofmind/dialog_submix_original

https://github.com/google-research-datasets/Taskmaster
https://github.com/OFA-Sys/ExpertLLaMA

https://github.com/ziliwangnlp/RefGPT
Mutonix/RefGPT-Fact
Mutonix/RefGPT-Code-ds
Mutonix/RefGPT-Code-cr
Mutonix/RefGPT-Code-bg

PocketDoc/Alpaca_Evol_Instruct_Cleaned
GAIR/lima
PocketDoc/DansPileOfSets
WizardLM/WizardLM_evol_instruct_V2_196k

Alignment-Lab-AI/AILabAssistant

tasksource/tasksource-instruct-v0
tasksource/zero-shot-label-nli
tasksource/icl-symbol-tuning-instruct

vietgpt/OIG_mathqa_flanv2_en
neural_code_search
neulab/conala
reshinthadith/synthetic_program_synthesis_python_1M
fiveflow/cot_ranking
squad_adversarial
bigscience-data/roots_zh-cn_wikipedia


knkarthick/dialogsum
jondurbin/rosettacode-raw
winddude/IHOPv01
jondurbin/rosettacode-10
Norquinal/claude_multi_instruct_1k
Open-Orca/OpenOrca
ehartford/dolphin

CheshireAI/guanaco-unchained
Salesforce/dialogstudio
theblackcat102/evol-codealpaca-v1
nickrosh/Evol-Instruct-Code-80k-v1


P1ayer-1/books-3
P1ayer-1/college_textbooks
P1ayer-1/books-3-textbooks
P1ayer-1/crash_course_subs
P1ayer-1/stack-exchange-preferences-code-v2
goendalf666/sql-chat-instructions
wenhu/TheoremQA
lukaemon/bbh
dmayhem93/agieval-lsat-ar
lmsys/chatbot_arena_conversations
shahules786/orca-chat
CarperAI/openai_summarize_comparisons
declare-lab/InstructEvalImpact
nampdn-ai/tiny-codes


declare-lab/flan-mini
allenai/peS2o
LinkSoul/instruction_merge_set

jondurbin/airoboros-gpt4-m2.0
rombodawg/MegaCodeTraining200k
causalnlp/corr2cause
ehartford/WizardLM_evol_instruct_V2_196k_unfiltered_merged_split
ehartford/open-instruct-uncensored
stingning/ultrachat

# 14 GB
ArmelR/stack-exchange-instruction

https://github.com/HKUNLP/DS-1000
https://github.com/project-baize/baize-chatbot
https://github.com/leanprover/lean4-samples
https://github.com/microsoft/promptbench
https://github.com/llm-attacks/llm-attacks
https://github.com/lz1oceani/verify_cot
https://github.com/Lichang-Chen/InstructZero
https://github.com/salesforce/factualNLG

# huge extra downloads, TODO subreddit
https://github.com/CornellNLP/ConvoKit

# huge extra downloads and mostly dup
https://github.com/allenai/open-instruct

# has extra download

https://github.com/wellecks/naturalproofs

https://github.com/brightmart/nlp_chinese_corpus
https://github.com/FranxYao/chain-of-thought-hub

https://github.com/Troyanovsky/Local-LLM-comparison

# https://people.eecs.berkeley.edu/~hendrycks/MATH.tar
# https://drive.google.com/open?id=1hQsua3TkpEmcJD_UWQx8dmNdEZPyxw23&authuser=0
https://github.com/hendrycks/math/

# https://s3.amazonaws.com/datasets.huggingface.co/scientific_papers/1.1.1/arxiv-dataset.zip
scientific_papers

# for now:
# https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/xml/en.zip
open_subtitles

# also https://cloud.tsinghua.edu.cn/d/a5a16f2381e7439eb475/
https://github.com/thu-coai/LOT-LongLM

https://github.com/StevenGrove/GPT4Tools
https://github.com/HKUNLP/UnifiedSKG
https://github.com/RUCAIBox/StructGPT

# download_raw_data.sh + download_preprocessed_data.sh 
https://github.com/michiyasunaga/DrRepair

# https://cloud.google.com/sdk/docs/install-sdk#linux
# gsutil -m cp -r gs://dm-code_contests local_location
https://github.com/deepmind/code_contests

# use gdown to download the dataset (~4G) and notebooks (~8G)
https://github.com/rajasagashe/JuICe

# https://github.com/InternLM/opencompass/releases/download/0.1.1/OpenCompassData.zip
https://github.com/InternLM/opencompass


https://github.com/lean-dojo/ReProver
# Lean 3: https://zenodo.org/record/8016386 by https://github.com/lean-dojo/LeanDojo/blob/main/scripts/generate-benchmark-lean3.ipynb
# Lean 4: https://zenodo.org/record/8040110 by https://github.com/lean-dojo/LeanDojo/blob/main/scripts/generate-benchmark-lean4.ipynb
https://github.com/lean-dojo/LeanDojo

# should have been filtered

Helsinki-NLP/tatoeba_mt

# partial download
allenai/objaverse
# huge 'data/*'
tiiuae/falcon-refinedweb

# TODO: partial download
NTU-NLP-sg/xCodeEval

# huge download

# '*'
RyokoAI/CNNovel125K
RyokoAI/Fandom23K
RyokoAI/ScribbleHub17K
RyokoAI/Honeyfeed3600

# '*'
roneneldan/TinyStories

# selective download

# 'data/agda/*'  'data/coq/*' 'data/c2hs-haskell/*' 'data/f-sharp/*' 'data/idris/*' 'data/isabelle/*' 'data/julia/*' 'data/kotlin/*' 'data/lean/*'  'data/literate-agda/*' 'data/literate-haskell/*' 'data/markdown/*' 'data/mathematica/*' 'data/prolog/*' 'data/restructuredtext/*' 'data/rust/*' 'data/sage/*' 'data/tex/*'
bigcode/the-stack

# wiki.jsonl book.jsonl filtered_08cdfa755e6d4d89b673d5bd1acee5f6.sampled.jsonl arxiv_*.jsonl
togethercomputer/RedPajama-Data-1T

# 'preprocessed/adult/*' 'preprocessed/chain_of_thought/*' 'preprocessed/conversation/*' 'preprocessed/instruct/*' 'preprocessed/knowledge/*' 'preprocessed/rlhf/*' 'preprocessed/summarisation/*' 'preprocessed/system/*'
m8than/normalised_chatml_rwkvready

#
tiiuae/falcon-refinedweb

# TODO: selective download

# 'en/*' 'zh/*'
bigscience/xP3

# TODO: need extra download

allenai/lila

https://github.com/fighting41love/funNLP

https://github.com/X-PLUG/mPLUG-Owl

# TODO: too big, too foundational, need to filter and dedup

# ~3G
# SirNeural/flan_v2

# ~85G
# conceptofmind/flan_dialog_submix

# 7GB
#   ccdv/arxiv-summarization

# 311GB
# bigcode/starcoderdata

# ~90GB
# MMInstruction/M3IT
# 204GB
# MMInstruction/M3IT-80

# https://huggingface.co/datasets/allenai/c4/tree/mC4_3.1.0/multilingual
# allenai/c4
# allenai/nllb

# oscar-corpus/OSCAR-2201

# sil-ai/bloom-lm
# EleutherAI/the_pile
# EleutherAI/the_pile_deduplicated

# bigscience/bloomz

# bigscience/evaluation-results

# c4
# wikipedia
# the_pile_books3
# pg19


# TODO