From 168bd98ccd1f954342d98f3802fd0b0a42d6e555 Mon Sep 17 00:00:00 2001 From: anton083 Date: Sat, 23 Mar 2024 20:43:55 +0100 Subject: [PATCH] Edit documentation --- README.md | 8 +++++ docs/make.jl | 3 ++ docs/src/index.md | 79 ++++++++++++++++++++++++++++++++++++++++--- docs/src/reference.md | 5 +++ 4 files changed, 90 insertions(+), 5 deletions(-) create mode 100644 docs/src/reference.md diff --git a/README.md b/README.md index 2933237..4bf30e7 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,14 @@ Conflux.jl is a toolkit designed to enable data parallelism for [Flux.jl](https: See the documentation for more details, examples, and important caveats. +## Installation + +The package can be installed with the Julia package manager. From the Julia REPL, type `]` to enter the Pkg REPL mode and run: + +```julia +pkg> add https://github.com/MurrellGroup/Conflux.jl#main + ## Example usage ```julia diff --git a/docs/make.jl b/docs/make.jl index 55ab2f4..c36d997 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -7,13 +7,16 @@ makedocs(; modules=[Conflux], authors="Anton Oresten ", sitename="Conflux.jl", + doctest=false, format=Documenter.HTML(; edit_link="main", assets=String[], ), pages=[ "Home" => "index.md", + "API Reference" => "reference.md", ], + checkdocs=:all, ) deploydocs( diff --git a/docs/src/index.md b/docs/src/index.md index d770892..a20046c 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -4,11 +4,80 @@ CurrentModule = Conflux # Conflux -Documentation for [Conflux](https://github.com/anton083/Conflux.jl). 
+[![Latest Release](https://img.shields.io/github/release/MurrellGroup/Conflux.jl.svg)](https://github.com/MurrellGroup/Conflux.jl/releases/latest) +[![MIT license](https://img.shields.io/badge/license-MIT-green.svg)](https://opensource.org/license/MIT) +[![Documentation](https://img.shields.io/badge/docs-stable-blue.svg)](https://MurrellGroup.github.io/Conflux.jl/stable/) +[![Documentation](https://img.shields.io/badge/docs-latest-blue.svg)](https://MurrellGroup.github.io/Conflux.jl/dev/) +[![Status](https://github.com/MurrellGroup/Conflux.jl/actions/workflows/CI.yml/badge.svg?branch=main)](https://github.com/MurrellGroup/Conflux.jl/actions/workflows/CI.yml?query=branch%3Amain) +[![Coverage](https://codecov.io/gh/MurrellGroup/Conflux.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/MurrellGroup/Conflux.jl) -```@index -``` +Conflux.jl is a toolkit designed to enable data parallelism for [Flux.jl](https://github.com/FluxML/Flux.jl) models by simplifying the process of replicating them across multiple GPUs on a single node, and by leveraging [NCCL.jl](https://github.com/JuliaGPU/NCCL.jl) for efficient inter-GPU communication. This package aims to provide a straightforward and intuitive interface for multi-GPU training, requiring minimal changes to existing code and training loops. + +## Features + +- Easy replication of objects across multiple GPUs with the **replicate** function +- Efficient synchronization of models and averaging of gradients with the **allreduce!** function, which takes an operator (e.g. `+`, `*`, `avg`) and a set of replicas, and reduces all their parameters with the given operator, leaving the replicas identical. +- A **withdevices** function that allows you to run code on each device asynchronously. + +See the documentation for more details, examples, and important caveats. + +## Installation -```@autodocs -Modules = [Conflux] +The package can be installed with the Julia package manager. 
From the Julia REPL, type `]` to enter the Pkg REPL mode and run: + +```julia +pkg> add https://github.com/MurrellGroup/Conflux.jl#main ``` + +## Example usage + +```julia +# Specify the default devices to use +ENV["CUDA_VISIBLE_DEVICES"] = "0,1" + +using Conflux + +using Flux, Optimisers + +model = Chain(Dense(1 => 256, tanh), Dense(256 => 512, tanh), Dense(512 => 256, tanh), Dense(256 => 1)) + +# This will use the available devices. If you want to use a specific device, you can pass them in a second argument. +models = replicate(model) + +opt = Optimisers.Adam(0.001f0) + +# Instantiate the optimiser states on each device +states = Conflux.withdevices() do (i, device) + Optimisers.setup(opt, model) |> device +end + +# A single batch, stored on CPU. Could use a more sophisticated mechanism to distribute multiple batches. +X = rand(1, 16) +Y = X .^ 2 + +loss(y, Y) = sum(abs2, y .- Y) + +losses = [] +for epoch in 1:10 + # Get the gradients for each batch on each device + ∇models = Conflux.withdevices() do (i, device) + x, y = device(X), device(Y) + # The second return value is a tuple because `Flux.withgradient` takes `args...`, and the model is the first argument. + l, (∇model,) = Flux.withgradient(m -> loss(m(x), y), models[i]) + push!(losses, l) + ∇model + end + + # Average the gradients across devices + allreduce!(avg, ∇models...) + + # Update the models on each device + Conflux.withdevices() do (i, device) + Optimisers.update!(states[i], models[i], ∇models[i]) + end + + # Optionally synchronize the models and optimiser states, in case the parameters diverge + #allreduce!(avg, models...) + #allreduce!(avg, states...) +end +``` \ No newline at end of file diff --git a/docs/src/reference.md b/docs/src/reference.md new file mode 100644 index 0000000..449ece5 --- /dev/null +++ b/docs/src/reference.md @@ -0,0 +1,5 @@ +# Reference + +```@autodocs +Modules = [Conflux] +``` \ No newline at end of file