-
Notifications
You must be signed in to change notification settings - Fork 232
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
9 changed files
with
210 additions
and
47 deletions.
There are no files selected for viewing
15 changes: 15 additions & 0 deletions
15
examples/huggingface/results/mistral_use_liger_False_patching_type_None.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
********** No Patching *********** | ||
********** No Patching *********** | ||
********** No Patching *********** | ||
********** No Patching *********** | ||
{'loss': 1.9423, 'grad_norm': 102.73744201660156, 'learning_rate': 6e-06, 'epoch': 0.0, 'num_input_tokens_seen': 40960} | ||
{'loss': 1.9385, 'grad_norm': 104.32758331298828, 'learning_rate': 5.819077862357725e-06, 'epoch': 0.01, 'num_input_tokens_seen': 78336, 'step': 2, 'step_time_sec': 2.96, 'avg_step_time_sec': 2.96, 'time_to_completion_sec': 23.68, 'estimated_total_time_sec': 29.6, 'step_peak_memory_allocated_MB': 34547.76, 'step_peak_memory_reserved_MB': 46892.0, 'total_peak_memory_allocated_MB': 34547.76, 'total_peak_memory_reserved_MB': 46892.0, 'step_tokens_per_second': 12625.64, 'avg_tokens_per_second': 12625.64} | ||
{'loss': 1.1825, 'grad_norm': 52.640846252441406, 'learning_rate': 5.298133329356934e-06, 'epoch': 0.01, 'num_input_tokens_seen': 118784, 'step': 3, 'step_time_sec': 3.72, 'avg_step_time_sec': 3.34, 'time_to_completion_sec': 23.36, 'estimated_total_time_sec': 33.38, 'step_peak_memory_allocated_MB': 34547.83, 'step_peak_memory_reserved_MB': 51576.0, 'total_peak_memory_allocated_MB': 34547.83, 'total_peak_memory_reserved_MB': 51576.0, 'step_tokens_per_second': 10887.47, 'avg_tokens_per_second': 11658.29} | ||
{'loss': 1.1446, 'grad_norm': 67.16675567626953, 'learning_rate': 4.5e-06, 'epoch': 0.01, 'num_input_tokens_seen': 163328, 'step': 4, 'step_time_sec': 4.63, 'avg_step_time_sec': 3.77, 'time_to_completion_sec': 22.6, 'estimated_total_time_sec': 37.67, 'step_peak_memory_allocated_MB': 34547.91, 'step_peak_memory_reserved_MB': 57572.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 57572.0, 'step_tokens_per_second': 9630.35, 'avg_tokens_per_second': 10828.26} | ||
{'loss': 1.0803, 'grad_norm': 21.94203758239746, 'learning_rate': 3.5209445330007917e-06, 'epoch': 0.01, 'num_input_tokens_seen': 196608, 'step': 5, 'step_time_sec': 2.89, 'avg_step_time_sec': 3.55, 'time_to_completion_sec': 17.74, 'estimated_total_time_sec': 35.48, 'step_peak_memory_allocated_MB': 34547.75, 'step_peak_memory_reserved_MB': 57572.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 57572.0, 'step_tokens_per_second': 11518.67, 'avg_tokens_per_second': 10968.83} | ||
{'loss': 1.098, 'grad_norm': 24.288616180419922, 'learning_rate': 2.4790554669992093e-06, 'epoch': 0.02, 'num_input_tokens_seen': 244736, 'step': 6, 'step_time_sec': 4.63, 'avg_step_time_sec': 3.76, 'time_to_completion_sec': 15.06, 'estimated_total_time_sec': 37.64, 'step_peak_memory_allocated_MB': 34547.72, 'step_peak_memory_reserved_MB': 58180.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 58180.0, 'step_tokens_per_second': 10393.91, 'avg_tokens_per_second': 10827.38} | ||
{'loss': 1.0376, 'grad_norm': 53.7581672668457, 'learning_rate': 1.5000000000000007e-06, 'epoch': 0.02, 'num_input_tokens_seen': 283648, 'step': 7, 'step_time_sec': 3.31, 'avg_step_time_sec': 3.69, 'time_to_completion_sec': 11.06, 'estimated_total_time_sec': 36.88, 'step_peak_memory_allocated_MB': 34547.76, 'step_peak_memory_reserved_MB': 58180.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 58180.0, 'step_tokens_per_second': 11766.3, 'avg_tokens_per_second': 10967.71} | ||
{'loss': 0.9906, 'grad_norm': 10.769705772399902, 'learning_rate': 7.018666706430663e-07, 'epoch': 0.02, 'num_input_tokens_seen': 324608, 'step': 8, 'step_time_sec': 3.88, 'avg_step_time_sec': 3.72, 'time_to_completion_sec': 7.43, 'estimated_total_time_sec': 37.16, 'step_peak_memory_allocated_MB': 34547.84, 'step_peak_memory_reserved_MB': 58180.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 58180.0, 'step_tokens_per_second': 10549.53, 'avg_tokens_per_second': 10905.29} | ||
{'loss': 0.9546, 'grad_norm': 7.883011341094971, 'learning_rate': 1.8092213764227505e-07, 'epoch': 0.02, 'num_input_tokens_seen': 362496, 'step': 9, 'step_time_sec': 3.42, 'avg_step_time_sec': 3.68, 'time_to_completion_sec': 3.68, 'estimated_total_time_sec': 36.78, 'step_peak_memory_allocated_MB': 34547.68, 'step_peak_memory_reserved_MB': 58180.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 58180.0, 'step_tokens_per_second': 11086.94, 'avg_tokens_per_second': 10926.38} | ||
{'loss': 0.9645, 'grad_norm': 7.525882720947266, 'learning_rate': 0.0, 'epoch': 0.03, 'num_input_tokens_seen': 396800, 'step': 10, 'step_time_sec': 2.99, 'avg_step_time_sec': 3.6, 'time_to_completion_sec': 0.0, 'estimated_total_time_sec': 36.02, 'step_peak_memory_allocated_MB': 34547.69, 'step_peak_memory_reserved_MB': 58180.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 58180.0, 'step_tokens_per_second': 11473.59, 'avg_tokens_per_second': 10976.85} | ||
{'train_runtime': 38.5858, 'train_samples_per_second': 33.173, 'train_steps_per_second': 0.259, 'train_loss': 1.2333564937114716, 'epoch': 0.03, 'num_input_tokens_seen': 396800, 'step': 10, 'step_time_sec': 2.99, 'avg_step_time_sec': 3.6, 'time_to_completion_sec': 0.0, 'estimated_total_time_sec': 36.02, 'step_peak_memory_allocated_MB': 34547.69, 'step_peak_memory_reserved_MB': 58180.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 58180.0, 'step_tokens_per_second': 11473.59, 'avg_tokens_per_second': 10976.85} |
15 changes: 15 additions & 0 deletions
15
examples/huggingface/results/mistral_use_liger_True_patching_type_post_init_class.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
********** Post-Init Class Patching *********** | ||
********** Post-Init Class Patching *********** | ||
********** Post-Init Class Patching *********** | ||
********** Post-Init Class Patching *********** | ||
{'loss': 2.0133, 'grad_norm': 111.51253509521484, 'learning_rate': 6e-06, 'epoch': 0.0, 'num_input_tokens_seen': 42496} | ||
{'loss': 1.9944, 'grad_norm': 110.4891128540039, 'learning_rate': 5.819077862357725e-06, 'epoch': 0.01, 'num_input_tokens_seen': 73728, 'step': 2, 'step_time_sec': 2.62, 'avg_step_time_sec': 2.62, 'time_to_completion_sec': 20.95, 'estimated_total_time_sec': 26.18, 'step_peak_memory_allocated_MB': 34547.75, 'step_peak_memory_reserved_MB': 45144.0, 'total_peak_memory_allocated_MB': 34547.75, 'total_peak_memory_reserved_MB': 45144.0, 'step_tokens_per_second': 11927.48, 'avg_tokens_per_second': 11927.48} | ||
{'loss': 1.193, 'grad_norm': 53.41102981567383, 'learning_rate': 5.298133329356934e-06, 'epoch': 0.01, 'num_input_tokens_seen': 107008, 'step': 3, 'step_time_sec': 3.14, 'avg_step_time_sec': 2.88, 'time_to_completion_sec': 20.16, 'estimated_total_time_sec': 28.79, 'step_peak_memory_allocated_MB': 34547.7, 'step_peak_memory_reserved_MB': 45860.0, 'total_peak_memory_allocated_MB': 34547.75, 'total_peak_memory_reserved_MB': 45860.0, 'step_tokens_per_second': 10598.12, 'avg_tokens_per_second': 11202.59} | ||
{'loss': 1.1854, 'grad_norm': 66.05502319335938, 'learning_rate': 4.5e-06, 'epoch': 0.01, 'num_input_tokens_seen': 142848, 'step': 4, 'step_time_sec': 3.37, 'avg_step_time_sec': 3.04, 'time_to_completion_sec': 18.26, 'estimated_total_time_sec': 30.43, 'step_peak_memory_allocated_MB': 34547.82, 'step_peak_memory_reserved_MB': 48548.0, 'total_peak_memory_allocated_MB': 34547.82, 'total_peak_memory_reserved_MB': 48548.0, 'step_tokens_per_second': 10630.55, 'avg_tokens_per_second': 10991.35} | ||
{'loss': 1.1145, 'grad_norm': 19.789567947387695, 'learning_rate': 3.5209445330007917e-06, 'epoch': 0.01, 'num_input_tokens_seen': 187392, 'step': 5, 'step_time_sec': 4.45, 'avg_step_time_sec': 3.39, 'time_to_completion_sec': 16.97, 'estimated_total_time_sec': 33.94, 'step_peak_memory_allocated_MB': 34547.91, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 10014.59, 'avg_tokens_per_second': 10671.38} | ||
{'loss': 1.0048, 'grad_norm': 20.529048919677734, 'learning_rate': 2.4790554669992093e-06, 'epoch': 0.02, 'num_input_tokens_seen': 224768, 'step': 6, 'step_time_sec': 3.27, 'avg_step_time_sec': 3.37, 'time_to_completion_sec': 13.48, 'estimated_total_time_sec': 33.69, 'step_peak_memory_allocated_MB': 34547.75, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 11434.53, 'avg_tokens_per_second': 10819.45} | ||
{'loss': 0.9917, 'grad_norm': 9.391414642333984, 'learning_rate': 1.5000000000000007e-06, 'epoch': 0.02, 'num_input_tokens_seen': 260096, 'step': 7, 'step_time_sec': 3.03, 'avg_step_time_sec': 3.31, 'time_to_completion_sec': 9.94, 'estimated_total_time_sec': 33.12, 'step_peak_memory_allocated_MB': 34547.74, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 11671.7, 'avg_tokens_per_second': 10949.25} | ||
{'loss': 0.9286, 'grad_norm': 7.622978687286377, 'learning_rate': 7.018666706430663e-07, 'epoch': 0.02, 'num_input_tokens_seen': 306176, 'step': 8, 'step_time_sec': 4.37, 'avg_step_time_sec': 3.46, 'time_to_completion_sec': 6.93, 'estimated_total_time_sec': 34.63, 'step_peak_memory_allocated_MB': 34547.78, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 10545.82, 'avg_tokens_per_second': 10876.54} | ||
{'loss': 0.984, 'grad_norm': 7.1107611656188965, 'learning_rate': 1.8092213764227505e-07, 'epoch': 0.02, 'num_input_tokens_seen': 348672, 'step': 9, 'step_time_sec': 3.59, 'avg_step_time_sec': 3.48, 'time_to_completion_sec': 3.48, 'estimated_total_time_sec': 34.79, 'step_peak_memory_allocated_MB': 34547.77, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 11832.11, 'avg_tokens_per_second': 10999.84} | ||
{'loss': 0.9725, 'grad_norm': 7.447627544403076, 'learning_rate': 0.0, 'epoch': 0.03, 'num_input_tokens_seen': 386560, 'step': 10, 'step_time_sec': 3.99, 'avg_step_time_sec': 3.54, 'time_to_completion_sec': 0.0, 'estimated_total_time_sec': 35.36, 'step_peak_memory_allocated_MB': 34547.87, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 9499.24, 'avg_tokens_per_second': 10811.76} | ||
{'train_runtime': 38.6812, 'train_samples_per_second': 33.091, 'train_steps_per_second': 0.259, 'train_loss': 1.2382215678691864, 'epoch': 0.03, 'num_input_tokens_seen': 386560, 'step': 10, 'step_time_sec': 3.99, 'avg_step_time_sec': 3.54, 'time_to_completion_sec': 0.0, 'estimated_total_time_sec': 35.36, 'step_peak_memory_allocated_MB': 34547.87, 'step_peak_memory_reserved_MB': 52132.0, 'total_peak_memory_allocated_MB': 34547.91, 'total_peak_memory_reserved_MB': 52132.0, 'step_tokens_per_second': 9499.24, 'avg_tokens_per_second': 10811.76} |
15 changes: 15 additions & 0 deletions
15
examples/huggingface/results/mistral_use_liger_True_patching_type_post_init_instance.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
********** Post-Init Instance Patching *********** | ||
********** Post-Init Instance Patching *********** | ||
********** Post-Init Instance Patching *********** | ||
********** Post-Init Instance Patching *********** | ||
{'loss': 10.3753, 'grad_norm': 80.14120483398438, 'learning_rate': 6e-06, 'epoch': 0.0, 'num_input_tokens_seen': 47104} | ||
{'loss': 10.374, 'grad_norm': 80.64556121826172, 'learning_rate': 5.819077862357725e-06, 'epoch': 0.01, 'num_input_tokens_seen': 89600, 'step': 2, 'step_time_sec': 2.98, 'avg_step_time_sec': 2.98, 'time_to_completion_sec': 23.82, 'estimated_total_time_sec': 29.78, 'step_peak_memory_allocated_MB': 34547.82, 'step_peak_memory_reserved_MB': 45690.0, 'total_peak_memory_allocated_MB': 34547.82, 'total_peak_memory_reserved_MB': 45690.0, 'step_tokens_per_second': 14269.83, 'avg_tokens_per_second': 14269.83} | ||
{'loss': 9.5078, 'grad_norm': 30.093812942504883, 'learning_rate': 5.298133329356934e-06, 'epoch': 0.01, 'num_input_tokens_seen': 131072, 'step': 3, 'step_time_sec': 3.66, 'avg_step_time_sec': 3.32, 'time_to_completion_sec': 23.22, 'estimated_total_time_sec': 33.17, 'step_peak_memory_allocated_MB': 34547.88, 'step_peak_memory_reserved_MB': 48532.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 48532.0, 'step_tokens_per_second': 11344.44, 'avg_tokens_per_second': 12657.71} | ||
{'loss': 8.7012, 'grad_norm': 33.24259948730469, 'learning_rate': 4.5e-06, 'epoch': 0.01, 'num_input_tokens_seen': 176640, 'step': 4, 'step_time_sec': 3.94, 'avg_step_time_sec': 3.53, 'time_to_completion_sec': 21.15, 'estimated_total_time_sec': 35.25, 'step_peak_memory_allocated_MB': 34547.71, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 11559.63, 'avg_tokens_per_second': 12248.41} | ||
{'loss': 7.7453, 'grad_norm': 46.81055450439453, 'learning_rate': 3.5209445330007917e-06, 'epoch': 0.01, 'num_input_tokens_seen': 222208, 'step': 5, 'step_time_sec': 3.93, 'avg_step_time_sec': 3.63, 'time_to_completion_sec': 18.13, 'estimated_total_time_sec': 36.26, 'step_peak_memory_allocated_MB': 34547.76, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 11605.14, 'avg_tokens_per_second': 12074.24} | ||
{'loss': 7.1292, 'grad_norm': 25.51975440979004, 'learning_rate': 2.4790554669992093e-06, 'epoch': 0.02, 'num_input_tokens_seen': 260096, 'step': 6, 'step_time_sec': 2.79, 'avg_step_time_sec': 3.46, 'time_to_completion_sec': 13.84, 'estimated_total_time_sec': 34.59, 'step_peak_memory_allocated_MB': 34547.74, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 13556.53, 'avg_tokens_per_second': 12313.75} | ||
{'loss': 6.747, 'grad_norm': 17.432945251464844, 'learning_rate': 1.5000000000000007e-06, 'epoch': 0.02, 'num_input_tokens_seen': 303104, 'step': 7, 'step_time_sec': 3.66, 'avg_step_time_sec': 3.49, 'time_to_completion_sec': 10.48, 'estimated_total_time_sec': 34.92, 'step_peak_memory_allocated_MB': 34547.74, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 11759.21, 'avg_tokens_per_second': 12216.96} | ||
{'loss': 6.4769, 'grad_norm': 16.094770431518555, 'learning_rate': 7.018666706430663e-07, 'epoch': 0.02, 'num_input_tokens_seen': 345600, 'step': 8, 'step_time_sec': 3.94, 'avg_step_time_sec': 3.56, 'time_to_completion_sec': 7.11, 'estimated_total_time_sec': 35.56, 'step_peak_memory_allocated_MB': 34547.75, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 10792.2, 'avg_tokens_per_second': 11991.58} | ||
{'loss': 6.3711, 'grad_norm': 14.258646011352539, 'learning_rate': 1.8092213764227505e-07, 'epoch': 0.02, 'num_input_tokens_seen': 393216, 'step': 9, 'step_time_sec': 3.95, 'avg_step_time_sec': 3.61, 'time_to_completion_sec': 3.61, 'estimated_total_time_sec': 36.05, 'step_peak_memory_allocated_MB': 34547.79, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 12050.23, 'avg_tokens_per_second': 11999.61} | ||
{'loss': 6.236, 'grad_norm': 13.197342872619629, 'learning_rate': 0.0, 'epoch': 0.03, 'num_input_tokens_seen': 432128, 'step': 10, 'step_time_sec': 3.01, 'avg_step_time_sec': 3.54, 'time_to_completion_sec': 0.0, 'estimated_total_time_sec': 35.4, 'step_peak_memory_allocated_MB': 34547.7, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 12913.91, 'avg_tokens_per_second': 12086.09} | ||
{'train_runtime': 38.3024, 'train_samples_per_second': 33.418, 'train_steps_per_second': 0.261, 'train_loss': 7.966402006149292, 'epoch': 0.03, 'num_input_tokens_seen': 432128, 'step': 10, 'step_time_sec': 3.01, 'avg_step_time_sec': 3.54, 'time_to_completion_sec': 0.0, 'estimated_total_time_sec': 35.4, 'step_peak_memory_allocated_MB': 34547.7, 'step_peak_memory_reserved_MB': 49246.0, 'total_peak_memory_allocated_MB': 34547.88, 'total_peak_memory_reserved_MB': 49246.0, 'step_tokens_per_second': 12913.91, 'avg_tokens_per_second': 12086.09} |
Oops, something went wrong.