From d73e9f88a1b19558596ebe27315e99345aa4d7b3 Mon Sep 17 00:00:00 2001
From: Avik Pal
Date: Mon, 23 Dec 2024 03:49:39 -0500
Subject: [PATCH] feat: working ConvMixer

---
 examples/CIFAR10/README.md     | 34 ----------------------------------
 examples/CIFAR10/conv_mixer.jl |  6 ++++--
 2 files changed, 4 insertions(+), 36 deletions(-)

diff --git a/examples/CIFAR10/README.md b/examples/CIFAR10/README.md
index 6e1841663..dea9cfc3d 100644
--- a/examples/CIFAR10/README.md
+++ b/examples/CIFAR10/README.md
@@ -45,44 +45,10 @@ julia --startup-file=no \
     --project=. \
     --threads=auto \
     conv_mixer.jl \
-    --lr-max=0.05 \
-    --weight-decay=0.0001 \
     --backend=reactant
 ```
 
-Here's an example output of the above command (on a RTX 4050 6GB Laptop GPU):
-
-```
-Epoch 1: Learning Rate 5.05e-03, Train Acc: 56.91%, Test Acc: 56.49%, Time: 129.84
-Epoch 2: Learning Rate 1.01e-02, Train Acc: 69.75%, Test Acc: 68.40%, Time: 21.22
-Epoch 3: Learning Rate 1.51e-02, Train Acc: 76.86%, Test Acc: 74.73%, Time: 21.33
-Epoch 4: Learning Rate 2.01e-02, Train Acc: 81.03%, Test Acc: 78.14%, Time: 21.40
-Epoch 5: Learning Rate 2.51e-02, Train Acc: 72.71%, Test Acc: 70.29%, Time: 21.34
-Epoch 6: Learning Rate 3.01e-02, Train Acc: 83.12%, Test Acc: 80.20%, Time: 21.38
-Epoch 7: Learning Rate 3.51e-02, Train Acc: 82.38%, Test Acc: 78.66%, Time: 21.39
-Epoch 8: Learning Rate 4.01e-02, Train Acc: 84.24%, Test Acc: 79.97%, Time: 21.49
-Epoch 9: Learning Rate 4.51e-02, Train Acc: 84.93%, Test Acc: 80.18%, Time: 21.40
-Epoch 10: Learning Rate 5.00e-02, Train Acc: 84.97%, Test Acc: 80.26%, Time: 21.37
-Epoch 11: Learning Rate 4.52e-02, Train Acc: 89.09%, Test Acc: 83.53%, Time: 21.31
-Epoch 12: Learning Rate 4.05e-02, Train Acc: 91.62%, Test Acc: 85.10%, Time: 21.39
-Epoch 13: Learning Rate 3.57e-02, Train Acc: 93.71%, Test Acc: 86.78%, Time: 21.29
-Epoch 14: Learning Rate 3.10e-02, Train Acc: 95.14%, Test Acc: 87.23%, Time: 21.37
-Epoch 15: Learning Rate 2.62e-02, Train Acc: 95.36%, Test Acc: 87.08%, Time: 21.34
-Epoch 16: Learning Rate 2.15e-02, Train Acc: 97.07%, Test Acc: 87.91%, Time: 21.26
-Epoch 17: Learning Rate 1.67e-02, Train Acc: 98.67%, Test Acc: 89.57%, Time: 21.40
-Epoch 18: Learning Rate 1.20e-02, Train Acc: 99.41%, Test Acc: 89.77%, Time: 21.28
-Epoch 19: Learning Rate 7.20e-03, Train Acc: 99.81%, Test Acc: 90.31%, Time: 21.39
-Epoch 20: Learning Rate 2.50e-03, Train Acc: 99.94%, Test Acc: 90.83%, Time: 21.44
-Epoch 21: Learning Rate 2.08e-03, Train Acc: 99.96%, Test Acc: 90.83%, Time: 21.23
-Epoch 22: Learning Rate 1.66e-03, Train Acc: 99.97%, Test Acc: 90.91%, Time: 21.29
-Epoch 23: Learning Rate 1.25e-03, Train Acc: 99.99%, Test Acc: 90.82%, Time: 21.29
-Epoch 24: Learning Rate 8.29e-04, Train Acc: 99.99%, Test Acc: 90.79%, Time: 21.32
-Epoch 25: Learning Rate 4.12e-04, Train Acc: 100.00%, Test Acc: 90.83%, Time: 21.32
-```
-
 ### Notes
 
 1. To match the results from the original repo, we need more augmentation strategies,
    that are currently not implemented in DataAugmentation.jl.
-2. Don't compare the reported timings in that repo against the numbers here. They time the
-   entire loop. We only time the training part of the loop.
diff --git a/examples/CIFAR10/conv_mixer.jl b/examples/CIFAR10/conv_mixer.jl
index 55f0b20da..170b11910 100644
--- a/examples/CIFAR10/conv_mixer.jl
+++ b/examples/CIFAR10/conv_mixer.jl
@@ -33,10 +33,12 @@ function ConvMixer(; dim, depth, kernel_size=5, patch_size=2)
     #! format: on
 end
 
-Comonicon.@main function main(; batchsize::Int=512, hidden_dim::Int=256, depth::Int=8,
+Comonicon.@main function main(;
+    batchsize::Int=512, hidden_dim::Int=256, depth::Int=8,
     patch_size::Int=2, kernel_size::Int=5, weight_decay::Float64=0.0001,
     clip_norm::Bool=false, seed::Int=1234, epochs::Int=25, lr_max::Float64=0.05,
-    backend::String="reactant")
+    backend::String="reactant"
+)
     model = ConvMixer(; dim=hidden_dim, depth, kernel_size, patch_size)
     opt = AdamW(; eta=lr_max, lambda=weight_decay)
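
Note for readers of this patch: the body of `ConvMixer` sits outside the hunk context, so only its signature is visible above. As a point of reference, here is a minimal, hypothetical Lux.jl sketch of a constructor with that signature, following the architecture from the ConvMixer paper ("Patches Are All You Need?"). The name `ConvMixerSketch` and the exact layer layout are assumptions for illustration, not the repository's implementation.

```julia
# Hypothetical sketch only -- NOT the implementation in conv_mixer.jl.
# ConvMixer: a patch-embedding convolution, `depth` mixer blocks
# (residual depthwise conv + pointwise conv), and a pooled linear head.
using Lux

function ConvMixerSketch(; dim, depth, kernel_size=5, patch_size=2)
    return Chain(
        # Patch embedding: strided conv from 3 RGB channels to `dim` channels
        Conv((patch_size, patch_size), 3 => dim, gelu; stride=patch_size),
        BatchNorm(dim),
        # `depth` mixer blocks
        [Chain(
            # Depthwise conv (groups=dim) wrapped in a residual connection
            SkipConnection(
                Chain(
                    Conv((kernel_size, kernel_size), dim => dim, gelu;
                        groups=dim, pad=SamePad()),
                    BatchNorm(dim),
                ),
                +,
            ),
            # Pointwise (1x1) conv mixes information across channels
            Conv((1, 1), dim => dim, gelu),
            BatchNorm(dim),
        ) for _ in 1:depth]...,
        # Head: global average pool over spatial dims, then a linear classifier
        GlobalMeanPool(),
        FlattenLayer(),
        Dense(dim => 10),  # CIFAR-10 has 10 classes
    )
end
```

The patched `main` then instantiates the model via `ConvMixer(; dim=hidden_dim, depth, kernel_size, patch_size)` and pairs it with Optimisers.jl's `AdamW(; eta=lr_max, lambda=weight_decay)`, as shown in the final hunk.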