From d721ed9d93b562bfaf23f97d1d6b42bde613306a Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 15:57:04 -0800 Subject: [PATCH 01/14] optims --- docs/source/_toctree.yml | 21 ++++++++++++++++++- docs/source/reference/optim/adagrad.md | 7 +++++++ docs/source/reference/optim/adam.md | 15 +++++++++++++ docs/source/reference/optim/adamw.md | 15 +++++++++++++ docs/source/reference/optim/lamb.md | 7 +++++++ docs/source/reference/optim/lars.md | 7 +++++++ docs/source/reference/optim/lion.md | 15 +++++++++++++ docs/source/reference/optim/optim_overview.md | 11 ++++++++++ docs/source/reference/optim/rmsprop.md | 7 +++++++ docs/source/reference/optim/sgd.md | 7 +++++++ docs/source/{ => reference}/quantization.mdx | 0 11 files changed, 111 insertions(+), 1 deletion(-) create mode 100644 docs/source/reference/optim/adagrad.md create mode 100644 docs/source/reference/optim/adam.md create mode 100644 docs/source/reference/optim/adamw.md create mode 100644 docs/source/reference/optim/lamb.md create mode 100644 docs/source/reference/optim/lars.md create mode 100644 docs/source/reference/optim/lion.md create mode 100644 docs/source/reference/optim/optim_overview.md create mode 100644 docs/source/reference/optim/rmsprop.md create mode 100644 docs/source/reference/optim/sgd.md rename docs/source/{ => reference}/quantization.mdx (100%) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7584207d0..84c68aa22 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -26,5 +26,24 @@ title: Papers, resources & how to cite - title: API reference sections: - - local: quantization + - local: reference/quantization title: Quantization + title: Optimizers + - local: reference/optim/optim_overview + title: Overview + - local: reference/optim/adagrad + title: AdaGrad + - local: reference/optim/adam + title: Adam + - local: reference/optim/AdamW + title: AdamW + - local: reference/optim/lamb + title: LAMB + - local: reference/optim/lars + title: LARS + - local: reference/optim/lion + title: Lion + - local: reference/optim/rmsprop + title: RMSprop + - local: reference/optim/sgd + title: SGD diff --git a/docs/source/reference/optim/adagrad.md b/docs/source/reference/optim/adagrad.md new file mode 100644 index 000000000..124532ea7 --- /dev/null +++ b/docs/source/reference/optim/adagrad.md @@ -0,0 +1,7 @@ +# AdaGrad + +[[autodoc]] Adagrad + +[[autodoc]] Adagrad8bit + +[[autodoc]] Adagrad32bit diff --git a/docs/source/reference/optim/adam.md b/docs/source/reference/optim/adam.md new file mode 100644 index 000000000..e7a5ce9b3 --- /dev/null +++ b/docs/source/reference/optim/adam.md @@ -0,0 +1,15 @@ +# Adam + +[[autodoc]] Adam + +[[autodoc]] Adam8bit + +[[autodoc]] Adam32bit + +## Paged Adam + +[[autodoc]] PagedAdam + +[[autodoc]] PagedAdam8bit + +[[autodoc]] PagedAdam32bit diff --git a/docs/source/reference/optim/adamw.md b/docs/source/reference/optim/adamw.md new file mode 100644 index 000000000..8ae1b77b0 --- /dev/null +++ b/docs/source/reference/optim/adamw.md @@ -0,0 +1,15 @@ +# AdamW + +[[autodoc]] AdamW + +[[autodoc]] AdamW8bit + +[[autodoc]] AdamW32bit + +## Paged AdamW + +[[autodoc]] PagedAdamW + +[[autodoc]] PagedAdamW8bit + +[[autodoc]] PagedAdamW32bit diff --git a/docs/source/reference/optim/lamb.md b/docs/source/reference/optim/lamb.md new file mode 100644 index 000000000..a21ebe10c --- /dev/null +++ b/docs/source/reference/optim/lamb.md @@ -0,0 +1,7 @@ +# LAMB + +[[autodoc]] LAMB + +[[autodoc]] LAMB8bit + +[[autodoc]] LAMB32bit diff --git 
a/docs/source/reference/optim/lars.md b/docs/source/reference/optim/lars.md new file mode 100644 index 000000000..ecf75446e --- /dev/null +++ b/docs/source/reference/optim/lars.md @@ -0,0 +1,7 @@ +# LARS + +[[autodoc]] LARS + +[[autodoc]] LARS8bit + +[[autodoc]] LARS32bit diff --git a/docs/source/reference/optim/lion.md b/docs/source/reference/optim/lion.md new file mode 100644 index 000000000..8eaa9aa22 --- /dev/null +++ b/docs/source/reference/optim/lion.md @@ -0,0 +1,15 @@ +# Lion + +[[autodoc]] Lion + +[[autodoc]] Lion8bit + +[[autodoc]] Lion32bit + +## Paged Lion + +[[autodoc]] PagedLion + +[[autodoc]] PagedLion8bit + +[[autodoc]] PagedLion32bit diff --git a/docs/source/reference/optim/optim_overview.md b/docs/source/reference/optim/optim_overview.md new file mode 100644 index 000000000..eb824b7a9 --- /dev/null +++ b/docs/source/reference/optim/optim_overview.md @@ -0,0 +1,11 @@ +# Overview + +[[autodoc]] Optimizer8bit + +[[autodoc]] Optimizer2State + +[[autodoc]] Optimizer1State + +## Utilities + +[[autodoc]] GlobalOptimManager diff --git a/docs/source/reference/optim/rmsprop.md b/docs/source/reference/optim/rmsprop.md new file mode 100644 index 000000000..8e014e521 --- /dev/null +++ b/docs/source/reference/optim/rmsprop.md @@ -0,0 +1,7 @@ +# RMSprop + +[[autodoc]] RMSprop + +[[autodoc]] RMSprop8bit + +[[autodoc]] RMSprop32bit diff --git a/docs/source/reference/optim/sgd.md b/docs/source/reference/optim/sgd.md new file mode 100644 index 000000000..da5f9b954 --- /dev/null +++ b/docs/source/reference/optim/sgd.md @@ -0,0 +1,7 @@ +# SGD + +[[autodoc]] SGD + +[[autodoc]] SGD8bit + +[[autodoc]] SGD32bit diff --git a/docs/source/quantization.mdx b/docs/source/reference/quantization.mdx similarity index 100% rename from docs/source/quantization.mdx rename to docs/source/reference/quantization.mdx From c0928842d4591593b410036caa2a567bdba0390d Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 16:05:43 -0800 Subject: [PATCH 02/14] fix path --- docs/source/reference/optim/adagrad.md | 6 +++--- docs/source/reference/optim/adam.md | 12 ++++++------ docs/source/reference/optim/adamw.md | 12 ++++++------ docs/source/reference/optim/lamb.md | 6 +++--- docs/source/reference/optim/lars.md | 6 +++--- docs/source/reference/optim/lion.md | 12 ++++++------ docs/source/reference/optim/optim_overview.md | 8 ++++---- docs/source/reference/optim/rmsprop.md | 6 +++--- docs/source/reference/optim/sgd.md | 6 +++--- 9 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/source/reference/optim/adagrad.md b/docs/source/reference/optim/adagrad.md index 124532ea7..f4554343e 100644 --- a/docs/source/reference/optim/adagrad.md +++ b/docs/source/reference/optim/adagrad.md @@ -1,7 +1,7 @@ # AdaGrad -[[autodoc]] Adagrad +[[autodoc]] optim.Adagrad -[[autodoc]] Adagrad8bit +[[autodoc]] optim.Adagrad8bit -[[autodoc]] Adagrad32bit +[[autodoc]] optim.Adagrad32bit diff --git a/docs/source/reference/optim/adam.md b/docs/source/reference/optim/adam.md index e7a5ce9b3..e2fa052a2 100644 --- a/docs/source/reference/optim/adam.md +++ b/docs/source/reference/optim/adam.md @@ -1,15 +1,15 @@ # Adam -[[autodoc]] Adam +[[autodoc]] optim.Adam -[[autodoc]] Adam8bit +[[autodoc]] optim.Adam8bit -[[autodoc]] Adam32bit +[[autodoc]] optim.Adam32bit ## Paged Adam -[[autodoc]] PagedAdam +[[autodoc]] optim.PagedAdam -[[autodoc]] PagedAdam8bit +[[autodoc]] optim.PagedAdam8bit -[[autodoc]] PagedAdam32bit +[[autodoc]] optim.PagedAdam32bit diff --git a/docs/source/reference/optim/adamw.md 
b/docs/source/reference/optim/adamw.md index 8ae1b77b0..6cf0e9cc5 100644 --- a/docs/source/reference/optim/adamw.md +++ b/docs/source/reference/optim/adamw.md @@ -1,15 +1,15 @@ # AdamW -[[autodoc]] AdamW +[[autodoc]] optim.AdamW -[[autodoc]] AdamW8bit +[[autodoc]] optim.AdamW8bit -[[autodoc]] AdamW32bit +[[autodoc]] optim.AdamW32bit ## Paged AdamW -[[autodoc]] PagedAdamW +[[autodoc]] optim.PagedAdamW -[[autodoc]] PagedAdamW8bit +[[autodoc]] optim.PagedAdamW8bit -[[autodoc]] PagedAdamW32bit +[[autodoc]] optim.PagedAdamW32bit diff --git a/docs/source/reference/optim/lamb.md b/docs/source/reference/optim/lamb.md index a21ebe10c..b43d3565f 100644 --- a/docs/source/reference/optim/lamb.md +++ b/docs/source/reference/optim/lamb.md @@ -1,7 +1,7 @@ # LAMB -[[autodoc]] LAMB +[[autodoc]] optim.LAMB -[[autodoc]] LAMB8bit +[[autodoc]] optim.LAMB8bit -[[autodoc]] LAMB32bit +[[autodoc]] optim.LAMB32bit diff --git a/docs/source/reference/optim/lars.md b/docs/source/reference/optim/lars.md index ecf75446e..755e463be 100644 --- a/docs/source/reference/optim/lars.md +++ b/docs/source/reference/optim/lars.md @@ -1,7 +1,7 @@ # LARS -[[autodoc]] LARS +[[autodoc]] optim.LARS -[[autodoc]] LARS8bit +[[autodoc]] optim.LARS8bit -[[autodoc]] LARS32bit +[[autodoc]] optim.LARS32bit diff --git a/docs/source/reference/optim/lion.md b/docs/source/reference/optim/lion.md index 8eaa9aa22..f77aa1cb8 100644 --- a/docs/source/reference/optim/lion.md +++ b/docs/source/reference/optim/lion.md @@ -1,15 +1,15 @@ # Lion -[[autodoc]] Lion +[[autodoc]] optim.Lion -[[autodoc]] Lion8bit +[[autodoc]] optim.Lion8bit -[[autodoc]] Lion32bit +[[autodoc]] optim.Lion32bit ## Paged Lion -[[autodoc]] PagedLion +[[autodoc]] optim.PagedLion -[[autodoc]] PagedLion8bit +[[autodoc]] optim.PagedLion8bit -[[autodoc]] PagedLion32bit +[[autodoc]] optim.PagedLion32bit diff --git a/docs/source/reference/optim/optim_overview.md b/docs/source/reference/optim/optim_overview.md index eb824b7a9..68c1d0219 100644 --- a/docs/source/reference/optim/optim_overview.md +++ b/docs/source/reference/optim/optim_overview.md @@ -1,11 +1,11 @@ # Overview -[[autodoc]] Optimizer8bit +[[autodoc]] optim.Optimizer8bit -[[autodoc]] Optimizer2State +[[autodoc]] optim.Optimizer2State -[[autodoc]] Optimizer1State +[[autodoc]] optim.Optimizer1State ## Utilities -[[autodoc]] GlobalOptimManager +[[autodoc]] optim.GlobalOptimManager diff --git a/docs/source/reference/optim/rmsprop.md b/docs/source/reference/optim/rmsprop.md index 8e014e521..77a272417 100644 --- a/docs/source/reference/optim/rmsprop.md +++ b/docs/source/reference/optim/rmsprop.md @@ -1,7 +1,7 @@ # RMSprop -[[autodoc]] RMSprop +[[autodoc]] optim.RMSprop -[[autodoc]] RMSprop8bit +[[autodoc]] optim.RMSprop8bit -[[autodoc]] RMSprop32bit +[[autodoc]] optim.RMSprop32bit diff --git a/docs/source/reference/optim/sgd.md b/docs/source/reference/optim/sgd.md index da5f9b954..8b0b1c48c 100644 --- a/docs/source/reference/optim/sgd.md +++ b/docs/source/reference/optim/sgd.md @@ -1,7 +1,7 @@ # SGD -[[autodoc]] SGD +[[autodoc]] optim.SGD -[[autodoc]] SGD8bit +[[autodoc]] optim.SGD8bit -[[autodoc]] SGD32bit +[[autodoc]] optim.SGD32bit From c02d0c25c5e05e2eacec39de2eee01f6bf02cbc9 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 16:11:37 -0800 Subject: [PATCH 03/14] fix path --- docs/source/reference/optim/adagrad.md | 6 +++--- docs/source/reference/optim/adam.md | 12 ++++++------ docs/source/reference/optim/adamw.md | 12 ++++++------ docs/source/reference/optim/lamb.md | 6 +++--- docs/source/reference/optim/lars.md | 6 
+++--- docs/source/reference/optim/lion.md | 12 ++++++------ docs/source/reference/optim/optim_overview.md | 8 ++++---- docs/source/reference/optim/rmsprop.md | 6 +++--- docs/source/reference/optim/sgd.md | 6 +++--- 9 files changed, 37 insertions(+), 37 deletions(-) diff --git a/docs/source/reference/optim/adagrad.md b/docs/source/reference/optim/adagrad.md index f4554343e..f90c9452a 100644 --- a/docs/source/reference/optim/adagrad.md +++ b/docs/source/reference/optim/adagrad.md @@ -1,7 +1,7 @@ # AdaGrad -[[autodoc]] optim.Adagrad +[[autodoc]] bitsandbytes.optim.Adagrad -[[autodoc]] optim.Adagrad8bit +[[autodoc]] bitsandbytes.optim.Adagrad8bit -[[autodoc]] optim.Adagrad32bit +[[autodoc]] bitsandbytes.optim.Adagrad32bit diff --git a/docs/source/reference/optim/adam.md b/docs/source/reference/optim/adam.md index e2fa052a2..0bb12ca80 100644 --- a/docs/source/reference/optim/adam.md +++ b/docs/source/reference/optim/adam.md @@ -1,15 +1,15 @@ # Adam -[[autodoc]] optim.Adam +[[autodoc]] bitsandbytes.optim.Adam -[[autodoc]] optim.Adam8bit +[[autodoc]] bitsandbytes.optim.Adam8bit -[[autodoc]] optim.Adam32bit +[[autodoc]] bitsandbytes.optim.Adam32bit ## Paged Adam -[[autodoc]] optim.PagedAdam +[[autodoc]] bitsandbytes.optim.PagedAdam -[[autodoc]] optim.PagedAdam8bit +[[autodoc]] bitsandbytes.optim.PagedAdam8bit -[[autodoc]] optim.PagedAdam32bit +[[autodoc]] bitsandbytes.optim.PagedAdam32bit diff --git a/docs/source/reference/optim/adamw.md b/docs/source/reference/optim/adamw.md index 6cf0e9cc5..9e85716df 100644 --- a/docs/source/reference/optim/adamw.md +++ b/docs/source/reference/optim/adamw.md @@ -1,15 +1,15 @@ # AdamW -[[autodoc]] optim.AdamW +[[autodoc]] bitsandbytes.optim.AdamW -[[autodoc]] optim.AdamW8bit +[[autodoc]] bitsandbytes.optim.AdamW8bit -[[autodoc]] optim.AdamW32bit +[[autodoc]] bitsandbytes.optim.AdamW32bit ## Paged AdamW -[[autodoc]] optim.PagedAdamW +[[autodoc]] bitsandbytes.optim.PagedAdamW -[[autodoc]] optim.PagedAdamW8bit +[[autodoc]] bitsandbytes.optim.PagedAdamW8bit -[[autodoc]] optim.PagedAdamW32bit +[[autodoc]] bitsandbytes.optim.PagedAdamW32bit diff --git a/docs/source/reference/optim/lamb.md b/docs/source/reference/optim/lamb.md index b43d3565f..ab583ed91 100644 --- a/docs/source/reference/optim/lamb.md +++ b/docs/source/reference/optim/lamb.md @@ -1,7 +1,7 @@ # LAMB -[[autodoc]] optim.LAMB +[[autodoc]] bitsandbytes.optim.LAMB -[[autodoc]] optim.LAMB8bit +[[autodoc]] bitsandbytes.optim.LAMB8bit -[[autodoc]] optim.LAMB32bit +[[autodoc]] bitsandbytes.optim.LAMB32bit diff --git a/docs/source/reference/optim/lars.md b/docs/source/reference/optim/lars.md index 755e463be..b5dde29d0 100644 --- a/docs/source/reference/optim/lars.md +++ b/docs/source/reference/optim/lars.md @@ -1,7 +1,7 @@ # LARS -[[autodoc]] optim.LARS +[[autodoc]] bitsandbytes.optim.LARS -[[autodoc]] optim.LARS8bit +[[autodoc]] bitsandbytes.optim.LARS8bit -[[autodoc]] optim.LARS32bit +[[autodoc]] bitsandbytes.optim.LARS32bit diff --git a/docs/source/reference/optim/lion.md b/docs/source/reference/optim/lion.md index f77aa1cb8..a9f849a3e 100644 --- a/docs/source/reference/optim/lion.md +++ b/docs/source/reference/optim/lion.md @@ -1,15 +1,15 @@ # Lion -[[autodoc]] optim.Lion +[[autodoc]] bitsandbytes.optim.Lion -[[autodoc]] optim.Lion8bit +[[autodoc]] bitsandbytes.optim.Lion8bit -[[autodoc]] optim.Lion32bit +[[autodoc]] bitsandbytes.optim.Lion32bit ## Paged Lion -[[autodoc]] optim.PagedLion +[[autodoc]] bitsandbytes.optim.PagedLion -[[autodoc]] optim.PagedLion8bit +[[autodoc]] bitsandbytes.optim.PagedLion8bit 
-[[autodoc]] optim.PagedLion32bit +[[autodoc]] bitsandbytes.optim.PagedLion32bit diff --git a/docs/source/reference/optim/optim_overview.md b/docs/source/reference/optim/optim_overview.md index 68c1d0219..87ec4b76d 100644 --- a/docs/source/reference/optim/optim_overview.md +++ b/docs/source/reference/optim/optim_overview.md @@ -1,11 +1,11 @@ # Overview -[[autodoc]] optim.Optimizer8bit +[[autodoc]] bitsandbytes.optim.Optimizer8bit -[[autodoc]] optim.Optimizer2State +[[autodoc]] bitsandbytes.optim.Optimizer2State -[[autodoc]] optim.Optimizer1State +[[autodoc]] bitsandbytes.optim.Optimizer1State ## Utilities -[[autodoc]] optim.GlobalOptimManager +[[autodoc]] bitsandbytes.optim.GlobalOptimManager diff --git a/docs/source/reference/optim/rmsprop.md b/docs/source/reference/optim/rmsprop.md index 77a272417..2ecb7f579 100644 --- a/docs/source/reference/optim/rmsprop.md +++ b/docs/source/reference/optim/rmsprop.md @@ -1,7 +1,7 @@ # RMSprop -[[autodoc]] optim.RMSprop +[[autodoc]] bitsandbytes.optim.RMSprop -[[autodoc]] optim.RMSprop8bit +[[autodoc]] bitsandbytes.optim.RMSprop8bit -[[autodoc]] optim.RMSprop32bit +[[autodoc]] bitsandbytes.optim.RMSprop32bit diff --git a/docs/source/reference/optim/sgd.md b/docs/source/reference/optim/sgd.md index 8b0b1c48c..3c24edcd2 100644 --- a/docs/source/reference/optim/sgd.md +++ b/docs/source/reference/optim/sgd.md @@ -1,7 +1,7 @@ # SGD -[[autodoc]] optim.SGD +[[autodoc]] bitsandbytes.optim.SGD -[[autodoc]] optim.SGD8bit +[[autodoc]] bitsandbytes.optim.SGD8bit -[[autodoc]] optim.SGD32bit +[[autodoc]] bitsandbytes.optim.SGD32bit From 679499605ecc76c4ab8c4681a1306e4c04d716bc Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 16:17:37 -0800 Subject: [PATCH 04/14] mdx --- docs/source/reference/optim/{adagrad.md => adagrad.mdx} | 0 docs/source/reference/optim/{adam.md => adam.mdx} | 0 docs/source/reference/optim/{adamw.md => adamw.mdx} | 0 docs/source/reference/optim/{lamb.md => lamb.mdx} | 0 docs/source/reference/optim/{lars.md => lars.mdx} | 0 docs/source/reference/optim/{lion.md => lion.mdx} | 0 .../reference/optim/{optim_overview.md => optim_overview.mdx} | 0 docs/source/reference/optim/{rmsprop.md => rmsprop.mdx} | 0 docs/source/reference/optim/{sgd.md => sgd.mdx} | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename docs/source/reference/optim/{adagrad.md => adagrad.mdx} (100%) rename docs/source/reference/optim/{adam.md => adam.mdx} (100%) rename docs/source/reference/optim/{adamw.md => adamw.mdx} (100%) rename docs/source/reference/optim/{lamb.md => lamb.mdx} (100%) rename docs/source/reference/optim/{lars.md => lars.mdx} (100%) rename docs/source/reference/optim/{lion.md => lion.mdx} (100%) rename docs/source/reference/optim/{optim_overview.md => optim_overview.mdx} (100%) rename docs/source/reference/optim/{rmsprop.md => rmsprop.mdx} (100%) rename docs/source/reference/optim/{sgd.md => sgd.mdx} (100%) diff --git a/docs/source/reference/optim/adagrad.md b/docs/source/reference/optim/adagrad.mdx similarity index 100% rename from docs/source/reference/optim/adagrad.md rename to docs/source/reference/optim/adagrad.mdx diff --git a/docs/source/reference/optim/adam.md b/docs/source/reference/optim/adam.mdx similarity index 100% rename from docs/source/reference/optim/adam.md rename to docs/source/reference/optim/adam.mdx diff --git a/docs/source/reference/optim/adamw.md b/docs/source/reference/optim/adamw.mdx similarity index 100% rename from docs/source/reference/optim/adamw.md rename to docs/source/reference/optim/adamw.mdx diff --git 
a/docs/source/reference/optim/lamb.md b/docs/source/reference/optim/lamb.mdx similarity index 100% rename from docs/source/reference/optim/lamb.md rename to docs/source/reference/optim/lamb.mdx diff --git a/docs/source/reference/optim/lars.md b/docs/source/reference/optim/lars.mdx similarity index 100% rename from docs/source/reference/optim/lars.md rename to docs/source/reference/optim/lars.mdx diff --git a/docs/source/reference/optim/lion.md b/docs/source/reference/optim/lion.mdx similarity index 100% rename from docs/source/reference/optim/lion.md rename to docs/source/reference/optim/lion.mdx diff --git a/docs/source/reference/optim/optim_overview.md b/docs/source/reference/optim/optim_overview.mdx similarity index 100% rename from docs/source/reference/optim/optim_overview.md rename to docs/source/reference/optim/optim_overview.mdx diff --git a/docs/source/reference/optim/rmsprop.md b/docs/source/reference/optim/rmsprop.mdx similarity index 100% rename from docs/source/reference/optim/rmsprop.md rename to docs/source/reference/optim/rmsprop.mdx diff --git a/docs/source/reference/optim/sgd.md b/docs/source/reference/optim/sgd.mdx similarity index 100% rename from docs/source/reference/optim/sgd.md rename to docs/source/reference/optim/sgd.mdx From 8e2a6105c3d27bbad315272f59afb5d1ca265227 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 16:24:29 -0800 Subject: [PATCH 05/14] fix path --- docs/source/reference/optim/optim_overview.mdx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/source/reference/optim/optim_overview.mdx b/docs/source/reference/optim/optim_overview.mdx index 87ec4b76d..7f5d221b0 100644 --- a/docs/source/reference/optim/optim_overview.mdx +++ b/docs/source/reference/optim/optim_overview.mdx @@ -1,11 +1,11 @@ # Overview -[[autodoc]] bitsandbytes.optim.Optimizer8bit +[[autodoc]] bitsandbytes.optim.optimizer.Optimizer8bit -[[autodoc]] bitsandbytes.optim.Optimizer2State +[[autodoc]] bitsandbytes.optim.optimizer.Optimizer2State -[[autodoc]] bitsandbytes.optim.Optimizer1State +[[autodoc]] bitsandbytes.optim.optimizer.Optimizer1State ## Utilities -[[autodoc]] bitsandbytes.optim.GlobalOptimManager +[[autodoc]] bitsandbytes.optim.optimizer.GlobalOptimManager From 8fb2ac834f99228ede557bcd023da2b66ab6857f Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 16:31:24 -0800 Subject: [PATCH 06/14] toctree --- docs/source/_toctree.yml | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 84c68aa22..7cbbae25c 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -28,22 +28,23 @@ sections: - local: reference/quantization title: Quantization - title: Optimizers - - local: reference/optim/optim_overview - title: Overview - - local: reference/optim/adagrad - title: AdaGrad - - local: reference/optim/adam - title: Adam - - local: reference/optim/AdamW - title: AdamW - - local: reference/optim/lamb - title: LAMB - - local: reference/optim/lars - title: LARS - - local: reference/optim/lion - title: Lion - - local: reference/optim/rmsprop - title: RMSprop - - local: reference/optim/sgd - title: SGD + - title: Optimizers + sections: + - local: reference/optim/optim_overview + title: Overview + - local: reference/optim/adagrad + title: AdaGrad + - local: reference/optim/adam + title: Adam + - local: reference/optim/AdamW + title: AdamW + - local: reference/optim/lamb + title: LAMB + - local: 
reference/optim/lars + title: LARS - local: reference/optim/lion + title: Lion - local: reference/optim/rmsprop + title: RMSprop - local: reference/optim/sgd + title: SGD From 5d4245b478182b380efa48c87b52554a8d918179 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Wed, 21 Feb 2024 16:34:47 -0800 Subject: [PATCH 07/14] fix --- docs/source/_toctree.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 7cbbae25c..6db060286 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -36,7 +36,7 @@ title: AdaGrad - local: reference/optim/adam title: Adam - - local: reference/optim/AdamW + - local: reference/optim/adamw title: AdamW - local: reference/optim/lamb title: LAMB From f5d02060ec892b3e28b45107cd2b796eee37da36 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 22 Feb 2024 12:34:00 -0800 Subject: [PATCH 08/14] optimizer, adagrad --- bitsandbytes/optim/adagrad.py | 81 ++++++++++++++ bitsandbytes/optim/optimizer.py | 140 ++++++++++++++++++++---- docs/source/reference/optim/adagrad.mdx | 7 ++ 3 files changed, 207 insertions(+), 21 deletions(-) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index 7d8df58ac..98d228a85 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -20,6 +20,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base Adagrad optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + lr_decay (`int`, defaults to 0): + The learning rate decay. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + initial_accumulator_value (`int`, defaults to 0): + The initial momentum values. + eps (`float`, defaults to 1e-10): + The epsilon value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: @@ -62,6 +89,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit Adagrad optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + lr_decay (`int`, defaults to 0): + The learning rate decay. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + initial_accumulator_value (`int`, defaults to 0): + The initial momentum values. + eps (`float`, defaults to 1e-10): + The epsilon value for the optimizer. + optim_bits (`int`, defaults to 8): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization.
+ percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: @@ -105,6 +159,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit Adagrad optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + lr_decay (`int`, defaults to 0): + The learning rate decay. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + initial_accumulator_value (`int`, defaults to 0): + The initial momentum values. + eps (`float`, defaults to 1e-10): + The epsilon value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index 8254d16b4..c21b3d7d3 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -18,6 +18,9 @@ def __init__(self, initial_data): class GlobalOptimManager: + """ + A global optimizer manager for enabling custom optimizer configs. + """ _instance = None def __init__(self): @@ -53,22 +56,40 @@ def override_config( self, parameters, key=None, value=None, key_value_dict=None ): """ - Overrides initial optimizer config for specific parameters. + Override initial optimizer config with specific hyperparameters. The key-values of the optimizer config for the input parameters are overridden - This can be both, optimizer parameters like "betas", or "lr" or it can be - 8-bit specific parameters like "optim_bits", "percentile_clipping". - - Parameters - ---------- - parameters : torch.Tensor or list(torch.Tensors) - The input parameters. - key : str - The hyperparamter to override. - value : object - The value for the hyperparamters. - key_value_dict : dict - A dictionary with multiple key-values to override. + This can be both, optimizer parameters like "betas" or "lr", or it can be + 8-bit specific parameters like "optim_bits" or "percentile_clipping". + + Arguments: + parameters (`torch.Tensor` or `list(torch.Tensors)`): + The input parameters. + key (`str`): + The hyperparameter to override. + value: + The hyperparameter values. + key_value_dict (`dict`): + A dictionary with multiple key-values to override. + + Example: + + ```py + import torch + import bitsandbytes as bnb + + mng = bnb.optim.GlobalOptimManager.get_instance() + + model = MyModel() + mng.register_parameters(model.parameters()) # 1.
register parameters while still on CPU + + model = model.cuda() + # use 8-bit optimizer states for all parameters + adam = bnb.optim.Adam(model.parameters(), lr=0.001, optim_bits=8) + + # 2. override: the parameter model.fc1.weight now uses 32-bit Adam + mng.override_config(model.fc1.weight, 'optim_bits', 32) + ``` """ self.uses_config_override = True if isinstance(parameters, torch.nn.Parameter): @@ -92,6 +113,17 @@ def register_module_override(self, module, param_name, config): class Optimizer8bit(torch.optim.Optimizer): def __init__(self, params, defaults, optim_bits=32, is_paged=False): + """ + Base 8-bit optimizer class. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__(params, defaults) self.initialized = False self.name2qmap = {} @@ -125,11 +157,11 @@ def __setstate__(self, state): super().__setstate__(state) def load_state_dict(self, state_dict): - r"""Loads the optimizer state. + """Load an optimizer state. - Args: - state_dict (dict): optimizer state. Should be an object returned - from a call to :meth:`state_dict`. + Arguments: + state_dict (`dict`): + An optimizer state (should be returned from a call to `state_dict`) to load. """ # deepcopy, to be consistent with module API state_dict = deepcopy(state_dict) @@ -237,11 +269,11 @@ def check_overrides(self): @torch.no_grad() def step(self, closure=None): - """Performs a single optimization step. + """Perform a single optimization step. Arguments: - closure (callable, optional): A closure that reevaluates the model - and returns the loss. + closure (`Callable`, *optional*, defaults to `None`): + A closure that reevaluates the model and returns the loss. """ loss = None if closure is not None: @@ -339,6 +371,39 @@ def __init__( skip_zeros=False, is_paged=False ): + """ + Base 2-state update optimizer class. + + Arguments: + optimizer_name (`str`): + The name of the optimizer. + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple`, defaults to (0.9, 0.999)): + The beta values for the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value for the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 0.0): + The maximum value to normalize each block with. + skip_zeros (`bool`, defaults to `False`): + Whether to skip zero values for sparse gradients and models to ensure correct updates. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. 
+ """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: @@ -552,6 +617,39 @@ def __init__( skip_zeros=False, is_paged=False ): + """ + Base 1-state update optimizer class. + + Arguments: + optimizer_name (`str`): + The name of the optimizer. + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple`, defaults to (0.9, 0.0)): + The beta values for the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value for the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 0.0): + The maximum value to normalize each block with. + skip_zeros (`bool`, defaults to `False`): + Whether to skip zero values for sparse gradients and models to ensure correct updates. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= eps: diff --git a/docs/source/reference/optim/adagrad.mdx b/docs/source/reference/optim/adagrad.mdx index f90c9452a..161c14a62 100644 --- a/docs/source/reference/optim/adagrad.mdx +++ b/docs/source/reference/optim/adagrad.mdx @@ -1,5 +1,12 @@ # AdaGrad +[AdaGrad (Adaptive Gradient)](https://jmlr.org/papers/v12/duchi11a.html) is an optimizer that adaptively adjusts the learning rate for each parameter based on their historical gradients. + +* Parameters with larger gradients are updated with smaller learning rates to avoid overshooting the minimum. +* Parameters with smaller gradients are updated with larger learning rates to catch up and converge faster. + +Since learning rates are automatically adjusted, AdaGrad does not require manually tuning learning rates. + [[autodoc]] bitsandbytes.optim.Adagrad [[autodoc]] bitsandbytes.optim.Adagrad8bit From 5e1cda5d052ae1016d7f2984fc6311c7a17e70ab Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 22 Feb 2024 12:44:28 -0800 Subject: [PATCH 09/14] add init --- docs/source/reference/optim/adagrad.mdx | 3 +++ docs/source/reference/optim/optim_overview.mdx | 3 +++ 2 files changed, 6 insertions(+) diff --git a/docs/source/reference/optim/adagrad.mdx b/docs/source/reference/optim/adagrad.mdx index 161c14a62..7bc0f3040 100644 --- a/docs/source/reference/optim/adagrad.mdx +++ b/docs/source/reference/optim/adagrad.mdx @@ -8,7 +8,10 @@ Since learning rates are automatically adjusted, AdaGrad does not require manually tuning learning rates. 
[[autodoc]] bitsandbytes.optim.Adagrad + - __init__ [[autodoc]] bitsandbytes.optim.Adagrad8bit + - __init__ [[autodoc]] bitsandbytes.optim.Adagrad32bit + - __init__ diff --git a/docs/source/reference/optim/optim_overview.mdx b/docs/source/reference/optim/optim_overview.mdx index 7f5d221b0..a4b5482a8 100644 --- a/docs/source/reference/optim/optim_overview.mdx +++ b/docs/source/reference/optim/optim_overview.mdx @@ -1,10 +1,13 @@ # Overview [[autodoc]] bitsandbytes.optim.optimizer.Optimizer8bit + - __init__ [[autodoc]] bitsandbytes.optim.optimizer.Optimizer2State + - __init__ [[autodoc]] bitsandbytes.optim.optimizer.Optimizer1State + - __init__ ## Utilities From 9f77a71046183881c55b271f45dd6fc92c8571e7 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 27 Feb 2024 11:00:03 -0800 Subject: [PATCH 10/14] add --- bitsandbytes/optim/adagrad.py | 8 +- bitsandbytes/optim/adam.py | 174 ++++++++++++++++++ bitsandbytes/optim/adamw.py | 174 ++++++++++++++++++ bitsandbytes/optim/lamb.py | 95 ++++++++++ bitsandbytes/optim/lars.py | 77 ++++++++ bitsandbytes/optim/lion.py | 140 ++++++++++++++ bitsandbytes/optim/optimizer.py | 10 +- bitsandbytes/optim/rmsprop.py | 87 +++++++++ bitsandbytes/optim/sgd.py | 77 ++++++++ docs/source/reference/optim/adagrad.mdx | 11 +- docs/source/reference/optim/adam.mdx | 25 ++- docs/source/reference/optim/adamw.mdx | 21 ++- docs/source/reference/optim/lamb.mdx | 14 ++ docs/source/reference/optim/lars.mdx | 11 ++ docs/source/reference/optim/lion.mdx | 20 +- .../source/reference/optim/optim_overview.mdx | 10 + docs/source/reference/optim/rmsprop.mdx | 8 + docs/source/reference/optim/sgd.mdx | 13 ++ 18 files changed, 958 insertions(+), 17 deletions(-) diff --git a/bitsandbytes/optim/adagrad.py b/bitsandbytes/optim/adagrad.py index 98d228a85..c2ea87ab0 100644 --- a/bitsandbytes/optim/adagrad.py +++ b/bitsandbytes/optim/adagrad.py @@ -35,7 +35,7 @@ def __init__( initial_accumulator_value (`int`, defaults to 0): The initial momemtum values. eps (`float`, defaults to 1e-10): - The epsilon value for the optimizer. + The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. args (`dict`, defaults to `None`): @@ -104,7 +104,7 @@ def __init__( initial_accumulator_value (`int`, defaults to 0): The initial momemtum values. eps (`float`, defaults to 1e-10): - The epsilon value for the optimizer. + The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 8): The number of bits of the optimizer state. args (`dict`, defaults to `None`): @@ -174,7 +174,7 @@ def __init__( initial_accumulator_value (`int`, defaults to 0): The initial momemtum values. eps (`float`, defaults to 1e-10): - The epsilon value for the optimizer. + The epsilon value prevents division by zero in the optimizer. optim_bits (`int`, defaults to 32): The number of bits of the optimizer state. args (`dict`, defaults to `None`): @@ -185,7 +185,7 @@ def __init__( Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. block_wise (`bool`, defaults to `True`): Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. 
- """ + """ if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") if not 0.0 <= weight_decay: diff --git a/bitsandbytes/optim/adam.py b/bitsandbytes/optim/adam.py index 86981eb86..e534c8b8f 100644 --- a/bitsandbytes/optim/adam.py +++ b/bitsandbytes/optim/adam.py @@ -16,31 +16,205 @@ class Adam(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. 
+ block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Adam32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedAdam(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Paged Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. 
+ percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdam8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit paged Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdam32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Paged 32-bit Adam optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. 
+ min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class AnalysisAdam(torch.optim.Optimizer): diff --git a/bitsandbytes/optim/adamw.py b/bitsandbytes/optim/adamw.py index 17383eed5..1e2dc04de 100644 --- a/bitsandbytes/optim/adamw.py +++ b/bitsandbytes/optim/adamw.py @@ -8,30 +8,204 @@ class AdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. 
+ amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged ) class AdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedAdamW(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. 
+ weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdamW8bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 8-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedAdamW32bit(Optimizer2State): def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=1e-2, amsgrad=False, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 32-bit AdamW optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. 
+ eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__( "adam", params, lr, betas, eps, weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/bitsandbytes/optim/lamb.py b/bitsandbytes/optim/lamb.py index 1fbb6fadc..ec829ee85 100644 --- a/bitsandbytes/optim/lamb.py +++ b/bitsandbytes/optim/lamb.py @@ -23,6 +23,39 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + Base LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, @@ -56,6 +89,37 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + 8-bit LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. 
+ betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, @@ -89,6 +153,37 @@ def __init__( block_wise=False, max_unorm=1.0, ): + """ + 32-bit LAMB optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-3): + The learning rate. + bias_correction (`bool`, defaults to `True`): + Whether to apply bias correction to the first and second-order moments. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + amsgrad (`bool`, defaults to `False`): + Whether to use the [AMSGrad](https://hf.co/papers/1904.09237) variant of Adam that uses the maximum of past squared gradients instead. + adam_w_mode (`bool`, defaults to `True`): + Whether to use the AdamW variant. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + max_unorm (`float`, defaults to 1.0): + The maximum gradient norm. + """ super().__init__( "lamb", params, diff --git a/bitsandbytes/optim/lars.py b/bitsandbytes/optim/lars.py index 73554e3cc..7449b805b 100644 --- a/bitsandbytes/optim/lars.py +++ b/bitsandbytes/optim/lars.py @@ -23,6 +23,33 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + Base LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. 
+ weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" @@ -57,6 +84,31 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + 8-bit LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" @@ -91,6 +143,31 @@ def __init__( percentile_clipping=100, max_unorm=0.02, ): + """ + 32-bit LARS optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 1e-2): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + max_unorm (`float`, defaults to 0.02): + The maximum gradient norm. + """ if momentum == 0: raise NotImplementedError( "LARS without momentum is not supported!" 
diff --git a/bitsandbytes/optim/lion.py b/bitsandbytes/optim/lion.py index b6ba4a9f1..ce185f863 100644 --- a/bitsandbytes/optim/lion.py +++ b/bitsandbytes/optim/lion.py @@ -7,25 +7,165 @@ class Lion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + Base Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion8bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 8-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class Lion32bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True, is_paged=False): + """ + 32-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. 
+ betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + is_paged (`bool`, defaults to `False`): + Whether the optimizer is a paged optimizer or not. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=is_paged) class PagedLion(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, optim_bits=32, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, optim_bits, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedLion8bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 8-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. 
+ block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 8, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) class PagedLion32bit(Optimizer1State): def __init__(self, params, lr=1e-4, betas=(0.9, 0.99), weight_decay=0, args=None, min_8bit_size=4096, percentile_clipping=100, block_wise=True): + """ + Paged 32-bit Lion optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-4): + The learning rate. + betas (`tuple(float, float)`, defaults to (0.9, 0.999)): + The beta values are the decay rates of the first and second-order moment of the optimizer. + weight_decay (`float`, defaults to 0): + The weight decay value for the optimizer. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ super().__init__("lion", params, lr, betas, 0., weight_decay, 32, args, min_8bit_size, percentile_clipping, block_wise, is_paged=True) diff --git a/bitsandbytes/optim/optimizer.py b/bitsandbytes/optim/optimizer.py index c21b3d7d3..a97afb026 100644 --- a/bitsandbytes/optim/optimizer.py +++ b/bitsandbytes/optim/optimizer.py @@ -59,8 +59,8 @@ def override_config( Override initial optimizer config with specific hyperparameters. The key-values of the optimizer config for the input parameters are overridden - This can be both, optimizer parameters like "betas" or "lr", or it can be - 8-bit specific parameters like "optim_bits" or "percentile_clipping". + This can be both, optimizer parameters like `betas` or `lr`, or it can be + 8-bit specific parameters like `optim_bits` or `percentile_clipping`. Arguments: parameters (`torch.Tensor` or `list(torch.Tensors)`): @@ -160,7 +160,7 @@ def load_state_dict(self, state_dict): """Load an optimizer state. Arguments: - state_dict (`dict`): + state_dict (`dict`): An optimizer state (should be returned from a call to `state_dict`) to load. """ # deepcopy, to be consistent with module API @@ -272,7 +272,7 @@ def step(self, closure=None): """Perform a single optimization step. Arguments: - closure (`Callable`, *optional*, defaults to `None`): + closure (`Callable`, *optional*, defaults to `None`): A closure that reevaluates the model and returns the loss. """ loss = None @@ -648,7 +648,7 @@ def __init__( skip_zeros (`bool`, defaults to `False`): Whether to skip zero values for sparse gradients and models to ensure correct updates. is_paged (`bool`, defaults to `False`): - Whether the optimizer is a paged optimizer or not. + Whether the optimizer is a paged optimizer or not. 
""" if not 0.0 <= lr: raise ValueError(f"Invalid learning rate: {lr}") diff --git a/bitsandbytes/optim/rmsprop.py b/bitsandbytes/optim/rmsprop.py index 2853ca723..ac371a66f 100644 --- a/bitsandbytes/optim/rmsprop.py +++ b/bitsandbytes/optim/rmsprop.py @@ -21,6 +21,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( "RMSprop with alpha==0.0 is not supported!" @@ -57,6 +86,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( "RMSprop with alpha==0.0 is not supported!" @@ -93,6 +151,35 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit RMSprop optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. 
+ lr (`float`, defaults to 1e-2): + The learning rate. + alpha (`float`, defaults to 0.99): + The alpha value is the decay rate of the squared gradients of the optimizer. + eps (`float`, defaults to 1e-8): + The epsilon value prevents division by zero in the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + centered (`bool`, defaults to `False`): + Whether the gradients are normalized by the variance. If `True`, it can help training at the expense of additional compute. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if alpha == 0: raise NotImplementedError( diff --git a/bitsandbytes/optim/sgd.py b/bitsandbytes/optim/sgd.py index 3c0fc2b9f..0f0b12e4b 100644 --- a/bitsandbytes/optim/sgd.py +++ b/bitsandbytes/optim/sgd.py @@ -20,6 +20,33 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + Base SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + optim_bits (`int`, defaults to 32): + The number of bits of the optimizer state. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( @@ -51,6 +78,31 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 8-bit SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. 
+ min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( @@ -82,6 +134,31 @@ def __init__( percentile_clipping=100, block_wise=True, ): + """ + 32-bit SGD optimizer. + + Arguments: + params (`torch.tensor`): + The input parameters to optimize. + lr (`float`): + The learning rate. + momentum (`float`, defaults to 0): + The momentum value speeds up the optimizer by taking bigger steps. + dampening (`float`, defaults to 0): + The dampening value reduces the momentum of the optimizer. + weight_decay (`float`, defaults to 0.0): + The weight decay value for the optimizer. + nesterov (`bool`, defaults to `False`): + Whether to use Nesterov momentum. + args (`dict`, defaults to `None`): + A dictionary with additional arguments. + min_8bit_size (`int`, defaults to 4096): + The minimum number of elements of the parameter tensors for 8-bit optimization. + percentile_clipping (`int`, defaults to 100): + Adapts clipping threshold automatically by tracking the last 100 gradient norms and clipping the gradient at a certain percentile to improve stability. + block_wise (`bool`, defaults to `True`): + Whether to independently quantize each block of tensors to reduce outlier effects and improve stability. + """ if momentum == 0: raise NotImplementedError("SGD without momentum is not supported!") super().__init__( diff --git a/docs/source/reference/optim/adagrad.mdx b/docs/source/reference/optim/adagrad.mdx index 7bc0f3040..8dddba04c 100644 --- a/docs/source/reference/optim/adagrad.mdx +++ b/docs/source/reference/optim/adagrad.mdx @@ -1,17 +1,18 @@ # AdaGrad -[AdaGrad (Adaptive Gradient)](https://jmlr.org/papers/v12/duchi11a.html) is an optimizer that adaptively adjusts the learning rate for each parameter based on their historical gradients. +[AdaGrad (Adaptive Gradient)](https://jmlr.org/papers/v12/duchi11a.html) is an adaptive learning rate optimizer. AdaGrad stores a sum of the squared past gradients for each parameter and uses it to scale their learning rate. This allows the learning rate to be automatically lower or higher depending on the magnitude of the gradient, eliminating the need to manually tune the learning rate. -* Parameters with larger gradients are updated with smaller learning rates to avoid overshooting the minimum. -* Parameters with smaller gradients are updated with larger learning rates to catch up and converge faster. - -Since learning rates are automatically adjusted, AdaGrad does not require manually tuning learning rates. 
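As an illustrative aside to the AdaGrad description above, the following minimal sketch shows how the 8-bit class might be used. The `bnb.optim.Adagrad8bit` name and `lr` argument follow the autodoc entries and docstrings in this patch; the toy model, dummy loss, and CUDA placement are assumptions for demonstration only.

```python
# Minimal sketch, assuming a CUDA-enabled bitsandbytes install and a toy model.
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(128, 128).cuda()

# 8-bit AdaGrad: the accumulated squared-gradient state is kept in 8 bits.
optimizer = bnb.optim.Adagrad8bit(model.parameters(), lr=1e-2)

loss = model(torch.randn(16, 128, device="cuda")).pow(2).mean()  # dummy loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
```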
+## Adagrad[[api-class]] [[autodoc]] bitsandbytes.optim.Adagrad - __init__ +## Adagrad8bit + [[autodoc]] bitsandbytes.optim.Adagrad8bit - __init__ +## Adagrad32bit + [[autodoc]] bitsandbytes.optim.Adagrad32bit - __init__ diff --git a/docs/source/reference/optim/adam.mdx b/docs/source/reference/optim/adam.mdx index 0bb12ca80..f367bc415 100644 --- a/docs/source/reference/optim/adam.mdx +++ b/docs/source/reference/optim/adam.mdx @@ -1,15 +1,38 @@ # Adam +[Adam (Adaptive moment estimation)](https://hf.co/papers/1412.6980) is an adaptive learning rate optimizer, combining ideas from [`SGD`] with momentum and [`RMSprop`] to automatically scale the learning rate: + +- a weighted average of the past gradients to provide direction (first moment) +- a weighted average of the *squared* past gradients to adapt the learning rate to each parameter (second moment) + +bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted. + +## Adam[[api-class]] + [[autodoc]] bitsandbytes.optim.Adam + - __init__ + +## Adam8bit [[autodoc]] bitsandbytes.optim.Adam8bit + - __init__ + +## Adam32bit [[autodoc]] bitsandbytes.optim.Adam32bit + - __init__ -## Paged Adam +## PagedAdam [[autodoc]] bitsandbytes.optim.PagedAdam + - __init__ + +## PagedAdam8bit [[autodoc]] bitsandbytes.optim.PagedAdam8bit + - __init__ + +## PagedAdam32bit [[autodoc]] bitsandbytes.optim.PagedAdam32bit + - __init__ diff --git a/docs/source/reference/optim/adamw.mdx b/docs/source/reference/optim/adamw.mdx index 9e85716df..e3dd410de 100644 --- a/docs/source/reference/optim/adamw.mdx +++ b/docs/source/reference/optim/adamw.mdx @@ -1,15 +1,34 @@ # AdamW +[AdamW](https://hf.co/papers/1711.05101) is a variant of the [`Adam`] optimizer that separates weight decay from the gradient update based on the observation that the weight decay formulation is different when applied to [`SGD`] and [`Adam`]. + +bitsandbytes also supports paged optimizers, which take advantage of CUDA's unified memory to transfer memory from the GPU to the CPU when GPU memory is exhausted.
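As an illustrative aside to the paged-optimizer note above, here is a hedged sketch of how a paged 8-bit AdamW might be used. The `PagedAdamW8bit` class and its default `lr`, `betas`, and `weight_decay` values mirror the docstrings added in this patch; the toy model, dummy loss, and CUDA placement are assumptions.

```python
# Minimal sketch, assuming a CUDA-enabled bitsandbytes install.
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(128, 128).cuda()

# Paged 8-bit AdamW: optimizer state can spill to CPU memory when GPU memory is exhausted.
optimizer = bnb.optim.PagedAdamW8bit(
    model.parameters(),
    lr=1e-3,
    betas=(0.9, 0.999),
    weight_decay=1e-2,
)

loss = model(torch.randn(16, 128, device="cuda")).pow(2).mean()  # dummy loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
```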
+ +## AdamW[[api-class]] + [[autodoc]] bitsandbytes.optim.AdamW + - __init__ + +## AdamW8bit [[autodoc]] bitsandbytes.optim.AdamW8bit + - __init__ + +## AdamW32bit [[autodoc]] bitsandbytes.optim.AdamW32bit + - __init__ -## Paged AdamW +## PagedAdamW [[autodoc]] bitsandbytes.optim.PagedAdamW + - __init__ +## PagedAdamW8bit [[autodoc]] bitsandbytes.optim.PagedAdamW8bit + - __init__ + +## PagedAdamW32bit [[autodoc]] bitsandbytes.optim.PagedAdamW32bit + - __init__ diff --git a/docs/source/reference/optim/lamb.mdx b/docs/source/reference/optim/lamb.mdx index ab583ed91..d581380ed 100644 --- a/docs/source/reference/optim/lamb.mdx +++ b/docs/source/reference/optim/lamb.mdx @@ -1,7 +1,21 @@ # LAMB +[LAMB (Layerwise adaptive large batch optimization)](https://hf.co/papers/1904.00962) is an adaptive optimizer designed for training with large batch sizes to accelerate training, combining ideas from [`LARS`] and [`Adam`] to automatically scale the learning rate for each layer: + +- calculates a *trust ratio* between the weight and gradient norm in a layer and clips the ratio to prevent overly large or small updates +- updates the weights with the first and second moments + +## LAMB[[api-class]] + [[autodoc]] bitsandbytes.optim.LAMB + - __init__ + +## LAMB8bit [[autodoc]] bitsandbytes.optim.LAMB8bit + - __init__ + +## LAMB32bit [[autodoc]] bitsandbytes.optim.LAMB32bit + - __init__ diff --git a/docs/source/reference/optim/lars.mdx b/docs/source/reference/optim/lars.mdx index b5dde29d0..93b5c55c3 100644 --- a/docs/source/reference/optim/lars.mdx +++ b/docs/source/reference/optim/lars.mdx @@ -1,7 +1,18 @@ # LARS +[LARS (Layer-wise Adaptive Rate Scaling)](https://hf.co/papers/1708.03888) is an optimizer designed for training with large batch sizes to accelerate training. LARS uses a separate learning rate for each *layer* instead of each parameter. The learning rate is calculated from a *trust ratio* between the weight and gradient norm in a layer. This helps calibrate a stable update size. + +## LARS[[api-class]] + [[autodoc]] bitsandbytes.optim.LARS + - __init__ + +## LARS8bit [[autodoc]] bitsandbytes.optim.LARS8bit + - __init__ + +## LARS32bit [[autodoc]] bitsandbytes.optim.LARS32bit + - __init__ diff --git a/docs/source/reference/optim/lion.mdx b/docs/source/reference/optim/lion.mdx index a9f849a3e..8183c27e7 100644 --- a/docs/source/reference/optim/lion.mdx +++ b/docs/source/reference/optim/lion.mdx @@ -1,15 +1,33 @@ # Lion +[Lion (Evolved Sign Momentum)](https://hf.co/papers/2302.06675) is a unique optimizer that uses the sign of the gradient to determine the update direction of the momentum. This makes Lion more memory-efficient and faster than [`AdamW`], which tracks and stores the first and second-order moments.
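As an illustrative aside to the Lion description above, here is a hedged sketch of the 8-bit class in use. `Lion8bit` and its `lr` and `weight_decay` defaults come from the docstrings added in this patch; the toy model, dummy loss, and CUDA placement are assumptions.

```python
# Minimal sketch, assuming a CUDA-enabled bitsandbytes install.
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(128, 128).cuda()

# Lion keeps a single momentum state, so it needs less memory than AdamW's two moments.
optimizer = bnb.optim.Lion8bit(model.parameters(), lr=1e-4, weight_decay=0.0)

loss = model(torch.randn(16, 128, device="cuda")).pow(2).mean()  # dummy loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
```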
+ +## Lion[[api-class]] + [[autodoc]] bitsandbytes.optim.Lion + - __init__ + +## Lion8bit [[autodoc]] bitsandbytes.optim.Lion8bit + - __init__ + +## Lion32bit [[autodoc]] bitsandbytes.optim.Lion32bit + - __init__ -## Paged Lion +## PagedLion [[autodoc]] bitsandbytes.optim.PagedLion + - __init__ + +## PagedLion8bit [[autodoc]] bitsandbytes.optim.PagedLion8bit + - __init__ + +## PagedLion32bit [[autodoc]] bitsandbytes.optim.PagedLion32bit + - __init__ diff --git a/docs/source/reference/optim/optim_overview.mdx b/docs/source/reference/optim/optim_overview.mdx index a4b5482a8..48e12b544 100644 --- a/docs/source/reference/optim/optim_overview.mdx +++ b/docs/source/reference/optim/optim_overview.mdx @@ -1,11 +1,21 @@ # Overview +[8-bit optimizers](https://hf.co/papers/2110.02861) reduce the memory footprint of 32-bit optimizers without any performance degradation which means you can train large models with many parameters faster. At the core of 8-bit optimizers is block-wise quantization which enables quantization accuracy, computational efficiency, and stability. + +bitsandbytes provides 8-bit optimizers through the base [`Optimizer8bit`] class, and additionally provides [`Optimizer2State`] and [`Optimizer1State`] for 2-state (for example, [`Adam`]) and 1-state (for example, [`Adagrad`]) optimizers respectively. To provide custom optimizer hyperparameters, use the [`GlobalOptimManager`] class to configure the optimizer. + +## Optimizer8bit + [[autodoc]] bitsandbytes.optim.optimizer.Optimizer8bit - __init__ +## Optimizer2State + [[autodoc]] bitsandbytes.optim.optimizer.Optimizer2State - __init__ +## Optimizer1State + [[autodoc]] bitsandbytes.optim.optimizer.Optimizer1State - __init__ diff --git a/docs/source/reference/optim/rmsprop.mdx b/docs/source/reference/optim/rmsprop.mdx index 2ecb7f579..33d839f6b 100644 --- a/docs/source/reference/optim/rmsprop.mdx +++ b/docs/source/reference/optim/rmsprop.mdx @@ -1,7 +1,15 @@ # RMSprop +RMSprop is an adaptive learning rate optimizer that is very similar to [`Adagrad`]. RMSprop stores a *weighted average* of the squared past gradients for each parameter and uses it to scale their learning rate. This allows the learning rate to be automatically lower or higher depending on the magnitude of the gradient, and it prevents the learning rate from diminishing. + +## RMSprop[[api-class]] + [[autodoc]] bitsandbytes.optim.RMSprop +## RMSprop8bit + [[autodoc]] bitsandbytes.optim.RMSprop8bit +## RMSprop32bit + [[autodoc]] bitsandbytes.optim.RMSprop32bit diff --git a/docs/source/reference/optim/sgd.mdx b/docs/source/reference/optim/sgd.mdx index 3c24edcd2..a0d09d1e8 100644 --- a/docs/source/reference/optim/sgd.mdx +++ b/docs/source/reference/optim/sgd.mdx @@ -1,7 +1,20 @@ # SGD +Stochastic gradient descent (SGD) is a basic gradient descent optimizer to minimize loss given a set of model parameters and updates the parameters in the opposite direction of the gradient. The update is performed on a randomly sampled mini-batch of data from the dataset. + +bitsandbytes also supports momentum and Nesterov momentum to accelerate SGD by adding a weighted average of past gradients to the current gradient. 
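As an illustrative aside to the SGD description above, here is a hedged sketch of SGD with momentum. `SGD8bit` and its `momentum` and `nesterov` arguments come from the docstrings in this patch, which also note that momentum must be non-zero; the toy model, dummy loss, and CUDA placement are assumptions.

```python
# Minimal sketch, assuming a CUDA-enabled bitsandbytes install.
import torch
import bitsandbytes as bnb

model = torch.nn.Linear(128, 128).cuda()

# momentum must be non-zero: the class raises NotImplementedError when momentum == 0.
optimizer = bnb.optim.SGD8bit(model.parameters(), lr=1e-2, momentum=0.9, nesterov=True)

loss = model(torch.randn(16, 128, device="cuda")).pow(2).mean()  # dummy loss
loss.backward()
optimizer.step()
optimizer.zero_grad()
```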
+ +## SGD[[api-class]] + [[autodoc]] bitsandbytes.optim.SGD + - __init__ + +## SGD8bit [[autodoc]] bitsandbytes.optim.SGD8bit + - __init__ + +## SGD32bit [[autodoc]] bitsandbytes.optim.SGD32bit + - __init__ From ead726e43bdec070e42c558e9dcf82370de5fcbd Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 29 Feb 2024 10:26:36 -0800 Subject: [PATCH 11/14] more apis --- bitsandbytes/nn/modules.py | 62 ++++++++++++++++--------- docs/source/_toctree.yml | 8 ++++ docs/source/reference/nn/embeddings.mdx | 15 ++++++ docs/source/reference/nn/linear4bit.mdx | 23 +++++++++ docs/source/reference/nn/linear8bit.mdx | 13 ++++++ 5 files changed, 99 insertions(+), 22 deletions(-) create mode 100644 docs/source/reference/nn/embeddings.mdx create mode 100644 docs/source/reference/nn/linear4bit.mdx create mode 100644 docs/source/reference/nn/linear8bit.mdx diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index bd2bd5832..ac7bb8e7b 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -21,16 +21,7 @@ class StableEmbedding(torch.nn.Embedding): """ - Custom embedding layer designed for stable training in NLP tasks. The stable - embedding layer improves stability during optimization for models with word - embeddings, addressing issues related to the non-uniform distribution of input - tokens. - - This stable embedding layer is initialized with Xavier uniform initialization, - followed by layer normalization. It is designed to support aggressive quantization, - addressing extreme gradient variations in non-uniform input distributions. The - stability of training is enhanced by using 32-bit optimizer states specifically - for this layer. + Custom embedding layer designed to improve stability during training for NLP tasks by using 32-bit optimizer states. It is designed to reduce gradient variations that can result from quantization. This embedding layer is initialized with Xavier uniform initialization followed by layer normalization. Example: @@ -47,14 +38,11 @@ class StableEmbedding(torch.nn.Embedding): ``` Attributes: - norm (torch.nn.LayerNorm): Layer normalization applied after the embedding. + norm (`torch.nn.LayerNorm`): Layer normalization applied after the embedding. Methods: reset_parameters(): Reset embedding parameters using Xavier uniform initialization. forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer. - - Reference: - - [8-bit optimizer paper](https://arxiv.org/pdf/2110.02861.pdf) """ def __init__( self, @@ -71,14 +59,22 @@ def __init__( ) -> None: """ Args: - num_embeddings (`int`): The number of unique embeddings (vocabulary size). - embedding_dim (`int`): The dimensionality of the embedding. - padding_idx (`Optional[int]`): If specified, pads the output with zeros at the given index. - max_norm (`Optional[float]`): If given, renormalizes embeddings to have a maximum L2 norm. - norm_type (`float`, defaults to `2.0`): The p-norm to compute for the max_norm option. - scale_grad_by_freq (`bool`): Scale gradient by frequency during backpropagation. - sparse (`bool`): If True, computes sparse gradients; False, computes dense gradients. - _weight (`Optional[Tensor]`): Pre-trained embeddings. + num_embeddings (`int`): + The number of unique embeddings (vocabulary size). + embedding_dim (`int`): + The dimensionality of the embedding. + padding_idx (`Optional[int]`): + Pads the output with zeros at the given index. + max_norm (`Optional[float]`): + Renormalizes embeddings to have a maximum L2 norm. 
+ norm_type (`float`, defaults to `2.0`): + The p-norm to compute for the `max_norm` option. + scale_grad_by_freq (`bool`, defaults to `False`): + Scale gradient by frequency during backpropagation. + sparse (`bool`, defaults to `False`): + Computes dense gradients. Set to `True` to compute sparse gradients instead. + _weight (`Optional[Tensor]`): + Pretrained embeddings. """ super().__init__( num_embeddings, @@ -131,6 +127,9 @@ def forward(self, input: Tensor) -> Tensor: class Embedding(torch.nn.Embedding): + """ + Embedding class to store and retrieve word embeddings from their indices. + """ def __init__( self, num_embeddings: int, @@ -143,6 +142,25 @@ def __init__( _weight: Optional[Tensor] = None, device: Optional[device] = None, ) -> None: + """ + Args: + num_embeddings (`int`): + The number of unique embeddings (vocabulary size). + embedding_dim (`int`): + The dimensionality of the embedding. + padding_idx (`Optional[int]`): + Pads the output with zeros at the given index. + max_norm (`Optional[float]`): + Renormalizes embeddings to have a maximum L2 norm. + norm_type (`float`, defaults to `2.0`): + The p-norm to compute for the `max_norm` option. + scale_grad_by_freq (`bool`, defaults to `False`): + Scale gradient by frequency during backpropagation. + sparse (`bool`, defaults to `False`): + Computes dense gradients. Set to `True` to compute sparse gradients instead. + _weight (`Optional[Tensor]`): + Pretrained embeddings. + """ super().__init__( num_embeddings, embedding_dim, diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 6db060286..87c4242de 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -48,3 +48,11 @@ title: RMSprop - local: reference/optim/sgd title: SGD + - title: k-bit quantizers + sections: + - local: reference/nn/linear8bit + title: 8-bit quantizer + - local: reference/nn/linear4bit + title: 4-bit quantizer + - local: reference/nn/embeddings + title: Embedding diff --git a/docs/source/reference/nn/embeddings.mdx b/docs/source/reference/nn/embeddings.mdx new file mode 100644 index 000000000..e725ecb17 --- /dev/null +++ b/docs/source/reference/nn/embeddings.mdx @@ -0,0 +1,15 @@ +# Embedding + +The embedding class is used to store and retrieve word embeddings from their indices. There are two types of embeddings in bitsandbytes, the standard PyTorch [`Embedding`] class and the [`StableEmbedding`] class. + +The [`StableEmbedding`] class was introduced in the [8-bit Optimizers via Block-wise Quantization](https://hf.co/papers/2110.02861) paper to reduce gradient variance as a result of the non-uniform distribution of input tokens. This class is designed to support quantization. + +## Embedding + +[[autodoc]] bitsandbytes.nn.Embedding + - __init__ + +## StableEmbedding + +[[autodoc]] bitsandbytes.nn.StableEmbedding + - __init__ diff --git a/docs/source/reference/nn/linear4bit.mdx b/docs/source/reference/nn/linear4bit.mdx new file mode 100644 index 000000000..88aec707d --- /dev/null +++ b/docs/source/reference/nn/linear4bit.mdx @@ -0,0 +1,23 @@ +# 4-bit quantization + +[QLoRA](https://hf.co/papers/2305.14314) is a finetuning method that quantizes a model to 4-bits and adds a set of low-rank adaptation (LoRA) weights to the model and tuning them through the quantized weights. This method also introduces a new data type, 4-bit NormalFloat (`LinearNF4`) in addition to the standard Float4 data type (`LinearFP4`). `LinearNF4` is adapted for weights initialized from a normal distribution and can improve performance. 
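As an illustrative aside to the 4-bit quantization description above, here is a hedged sketch of the NF4 linear layer in use. The `LinearNF4(input_features, output_features, bias=True, compute_dtype=...)` signature mirrors the docstrings added in this patch; the layer sizes, half-precision input, and CUDA placement are illustrative assumptions.

```python
# Minimal sketch, assuming a CUDA-enabled bitsandbytes install.
import torch
import bitsandbytes as bnb

# NF4 variant of the 4-bit linear layer; the weights are quantized
# to 4-bit when the module is moved to the GPU.
layer = bnb.nn.LinearNF4(128, 128, bias=True, compute_dtype=torch.float16).cuda()

out = layer(torch.randn(1, 128, dtype=torch.float16, device="cuda"))
print(out.shape)  # torch.Size([1, 128])
```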
+ +## Linear4bit + [[autodoc]] bitsandbytes.nn.Linear4bit + - __init__ + +## LinearFP4 + [[autodoc]] bitsandbytes.nn.LinearFP4 + - __init__ + +## LinearNF4 + [[autodoc]] bitsandbytes.nn.LinearNF4 + - __init__ + +## Params4bit + [[autodoc]] bitsandbytes.nn.Params4bit + - __init__ diff --git a/docs/source/reference/nn/linear8bit.mdx b/docs/source/reference/nn/linear8bit.mdx new file mode 100644 index 000000000..73254fe67 --- /dev/null +++ b/docs/source/reference/nn/linear8bit.mdx @@ -0,0 +1,13 @@ +# 8-bit quantization + +[LLM.int8()](https://hf.co/papers/2208.07339) is a quantization method that doesn't degrade performance, which makes large model inference more accessible. The key is to extract the outliers from the inputs and weights and multiply them in 16-bit. All other values are multiplied in 8-bit and quantized to Int8 before being dequantized back to 16-bit. The outputs from the 16-bit and 8-bit multiplication are combined to produce the final output. + +## Linear8bitLt + +[[autodoc]] bitsandbytes.nn.Linear8bitLt + - __init__ + +## Int8Params + +[[autodoc]] bitsandbytes.nn.Int8Params + - __init__ From 4716ff5ddafa9b57be4d4f91dab7796067870fe9 Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Thu, 29 Feb 2024 10:41:44 -0800 Subject: [PATCH 12/14] params --- bitsandbytes/nn/modules.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py index ac7bb8e7b..0dac35bf1 100644 --- a/bitsandbytes/nn/modules.py +++ b/bitsandbytes/nn/modules.py @@ -434,7 +434,19 @@ def forward(self, x: torch.Tensor): class LinearFP4(Linear4bit): + """ + Implements the FP4 data type. + """ def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None): + """ + Args: + input_features (`int`): + Number of input features of the linear layer. + output_features (`int`): + Number of output features of the linear layer. + bias (`bool`, defaults to `True`): + Whether the linear class uses the bias term as well. + """ super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'fp4', quant_storage, device) @@ -450,6 +462,15 @@ class LinearNF4(Linear4bit): the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236. ''' def __init__(self, input_features, output_features, bias=True, compute_dtype=None, compress_statistics=True, quant_storage=torch.uint8, device=None): + """ + Args: + input_features (`int`): + Number of input features of the linear layer. + output_features (`int`): + Number of output features of the linear layer. + bias (`bool`, defaults to `True`): + Whether the linear class uses the bias term as well.
+ """ super().__init__(input_features, output_features, bias, compute_dtype, compress_statistics, 'nf4', quant_storage, device) From 2f51d57a9289cb656c70e4231ea7b82f4e532e6d Mon Sep 17 00:00:00 2001 From: Steven Liu Date: Tue, 5 Mar 2024 13:26:51 -0800 Subject: [PATCH 13/14] clarify --- docs/source/reference/nn/linear4bit.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/reference/nn/linear4bit.mdx b/docs/source/reference/nn/linear4bit.mdx index 88aec707d..3cbf6509d 100644 --- a/docs/source/reference/nn/linear4bit.mdx +++ b/docs/source/reference/nn/linear4bit.mdx @@ -1,6 +1,6 @@ # 4-bit quantization -[QLoRA](https://hf.co/papers/2305.14314) is a finetuning method that quantizes a model to 4-bits and adds a set of low-rank adaptation (LoRA) weights to the model and tuning them through the quantized weights. This method also introduces a new data type, 4-bit NormalFloat (`LinearNF4`) in addition to the standard Float4 data type (`LinearFP4`). `LinearNF4` is adapted for weights initialized from a normal distribution and can improve performance. +[QLoRA](https://hf.co/papers/2305.14314) is a finetuning method that quantizes a model to 4-bits and adds a set of low-rank adaptation (LoRA) weights to the model and tuning them through the quantized weights. This method also introduces a new data type, 4-bit NormalFloat (`LinearNF4`) in addition to the standard Float4 data type (`LinearFP4`). `LinearNF4` is a quantization data type for normally distributed data and can improve performance. ## Linear4bit From d891c597b04f0622def7212a2a2ddda3c6ace9d3 Mon Sep 17 00:00:00 2001 From: Titus von Koeller <9048635+Titus-von-Koeller@users.noreply.github.com> Date: Thu, 7 Mar 2024 20:56:10 +0000 Subject: [PATCH 14/14] run pre-commit hooks --- .git-blame-ignore-revs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index c0386dc9f..fc44037d8 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -8,4 +8,4 @@ ea7c14f8ef64924f2d0ff80df3cdabf2c7299848 7727fa4c8c6c1ef2b109120aff4196a0a6bf3ed6 # format tests/linear_4bit.py -34735ba89de8235ea9da6ef409f814dcea9e2038 \ No newline at end of file +34735ba89de8235ea9da6ef409f814dcea9e2038