[WIP] [doc] performance/scalability revamp #15213

Closed · wants to merge 9 commits
16 changes: 15 additions & 1 deletion docs/source/_toctree.yml
@@ -1,4 +1,4 @@
- sections:
- sections:
- local: index
title: 🤗 Transformers
- local: quicktour
@@ -63,6 +63,20 @@
title: 'Performance and Scalability: How To Fit a Bigger Model and Train It Faster'
- local: parallelism
title: Model Parallelism
- local: perf_infer
Collaborator: Let's make a subfolder instead of having so many document names prefixed with perf.

Contributor (author): great idea

title: Performance - Inference
- local: perf_infer_gpu_one
title: Performance - Inference on one GPU
- local: perf_infer_gpu_many
title: Performance - Inference on many GPUs
- local: perf_infer_cpu
title: Performance - Inference on CPU
- local: perf_train
title: Performance - Training
- local: perf_train_gpu_one
title: Performance - Training on one GPU
- local: perf_train_gpu_many
title: Performance - Training on many GPUs
- local: testing
title: Testing
- local: debugging
22 changes: 22 additions & 0 deletions docs/source/perf_infer.mdx
@@ -0,0 +1,22 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->

# Efficient Inference

## Memory Needs During Inference

Roughly 4-6 bytes per model parameter: fp32 weights alone take 4 bytes per parameter, with activations and temporary buffers accounting for the rest.
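
As a back-of-the-envelope check, here is a minimal sketch of what that budget means in practice (the 1B-parameter model size is a hypothetical example):

```python
# Rough inference memory estimate for the "4-6 bytes per parameter" rule.
params = 1_000_000_000  # hypothetical 1B-parameter model

weights_gib = params * 4 / 2**30  # fp32 weights: 4 bytes per parameter
low, high = params * 4 / 2**30, params * 6 / 2**30

print(f"fp32 weights alone: ~{weights_gib:.1f} GiB")
print(f"total budget (4-6 bytes/param): ~{low:.1f}-{high:.1f} GiB")
```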

## Choose Your Scale

- [One GPU](perf_infer_gpu_one)
- [Many GPUs](perf_infer_gpu_many)
- [CPU](perf_infer_cpu)
30 changes: 30 additions & 0 deletions docs/source/perf_infer_cpu.mdx
@@ -0,0 +1,30 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->

# Efficient Inference on CPU


## Less Memory



## Faster Speed




## Scalability Strategy

* DeepSpeed ZeRO Stage 3 + CPU/NVMe Offload (see the config sketch below)

* SageMaker

* DeepSpeed-Inference
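
A minimal sketch of what the first option's configuration could look like, expressed as a Python dict (the key names follow the DeepSpeed config schema; the batch size and offload values here are placeholder assumptions, not values from this PR):

```python
# Sketch of a DeepSpeed ZeRO Stage 3 config with CPU parameter offload.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        # Offload parameters to CPU RAM; switch "device" to "nvme" and add
        # "nvme_path" to offload to local NVMe storage instead.
        "offload_param": {"device": "cpu", "pin_memory": True},
    },
    "train_micro_batch_size_per_gpu": 1,  # placeholder value
}
```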
47 changes: 47 additions & 0 deletions docs/source/perf_infer_gpu_many.mdx
@@ -0,0 +1,47 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->

# Efficient Inference on Multiple GPUs


## Less Memory

### fp16

### bf16

### Quantization



## Faster Speed

### DP vs DDP

### ONNX

### Infinity, Inference API





## Scalability Strategy

* DeepSpeed ZeRO Stage 3 + CPU/NVMe Offload

* SageMaker

* DeepSpeed-Inference



## Hardware
44 changes: 44 additions & 0 deletions docs/source/perf_infer_gpu_one.mdx
@@ -0,0 +1,44 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->

# Efficient Inference on a Single GPU



## Less Memory

### fp16

### bf16

### Quantization





## Faster Speed

### Batch sizes

### ONNX

### Infinity, Inference API



## Scalability Strategy

* DeepSpeed ZeRO Stage 3 + CPU/NVMe Offload

* SageMaker

* DeepSpeed-Inference
23 changes: 23 additions & 0 deletions docs/source/perf_train.mdx
@@ -0,0 +1,23 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->

# Efficient Training



## Memory Needs During Training

Roughly 16-18 bytes per model parameter with an Adam-style optimizer: 16x for fp32 training (4-byte weights, 4-byte gradients, 8 bytes of optimizer state per parameter), and about 18x for mixed precision, which adds a 2-byte half-precision copy of the weights. Activations come on top of that.
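
A hedged back-of-the-envelope sketch of that accounting (the 1B-parameter size is a hypothetical example; activations and temporary buffers are excluded):

```python
# Per-parameter memory for Adam-style training, excluding activations.
params = 1_000_000_000  # hypothetical 1B-parameter model

fp32_bytes = 4 + 4 + 8        # weights + gradients + optimizer states = 16
mixed_bytes = fp32_bytes + 2  # + half-precision weight copy = 18

print(f"fp32 training:   ~{params * fp32_bytes / 2**30:.0f} GiB")
print(f"mixed precision: ~{params * mixed_bytes / 2**30:.0f} GiB")
```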

## Choose Your Scale

- [One GPU](perf_train_gpu_one)
- [Many GPUs](perf_train_gpu_many)
80 changes: 80 additions & 0 deletions docs/source/perf_train_gpu_many.mdx
@@ -0,0 +1,80 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->

# Efficient Training on Multiple GPUs



## Less Memory


### fp16

### bf16

### Gradient Accumulation

### Gradient Checkpointing

### Optimizer


## Faster Speed

### DP vs DDP

### Gradient Accumulation

### Batch sizes



## Scalability Strategy

**⇨ Single Node / Multi-GPU**

* Model fits onto a single GPU:

1. DDP - Distributed DP
2. ZeRO - may or may not be faster depending on the situation and configuration used

* Model doesn't fit onto a single GPU:

1. PP
2. ZeRO
3. TP

With very fast intra-node connectivity such as NVLINK or NVSwitch, all three should be mostly on par; without these, PP will be faster than TP or ZeRO. The degree of TP may also make a difference. It's best to experiment to find the winner on your particular setup.

TP is almost always used within a single node, that is, TP size <= GPUs per node.

* Largest Layer not fitting into a single GPU:

1. If not using ZeRO, one must use TP, as PP alone won't be able to fit the layer.
2. With ZeRO, see the same entry for "Single GPU" above


**⇨ Multi-Node / Multi-GPU**

* When you have fast inter-node connectivity:

1. ZeRO - as it requires close to no modifications to the model (see the launch sketch after this list)
2. PP+TP+DP - less communication, but requires massive changes to the model

* When you have slow inter-node connectivity and are still low on GPU memory:

1. DP+PP+TP+ZeRO-1
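
As a rough illustration of the "close to no modifications" point, here is a minimal sketch of enabling ZeRO through the `Trainer` (the config path, script name, and dataset are hypothetical; the JSON config file must exist on disk and DeepSpeed must be installed):

```python
from transformers import Trainer, TrainingArguments

# Point the Trainer at a DeepSpeed config; the model code itself is unchanged.
args = TrainingArguments(
    output_dir="output",
    deepspeed="ds_config_zero3.json",  # hypothetical ZeRO Stage 3 config file
    per_device_train_batch_size=1,     # placeholder value
)

# trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
# trainer.train()
# Launch with e.g.: deepspeed --num_gpus 8 train.py
```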





## Hardware