From 33e9da6c1ab7d1be5aca159d5c280df048928252 Mon Sep 17 00:00:00 2001
From: Ariel Faigon
Date: Mon, 30 Jan 2017 09:36:09 -0800
Subject: [PATCH]
- Shave ~1 sec from RunTests by speeding up the slowest test, 16 (bfgs), by ~70% (#1181)
  (achieved by dropping '-b 20'; termination at pass 13 and the output remain the same)
- Trim trailing spaces from the bfgs progress output
  (a trailing space appeared on these lines when the time field was removed)
- Trim trailing spaces from the bfgs.cc source
- Update the bfgs stderr reference files
---
 test/RunTests                         |  2 +-
 test/train-sets/ref/frank.stderr      | 16 +++++++-------
 test/train-sets/ref/rcv1_small.stderr | 30 +++++++++++++--------------
 test/train-sets/ref/zero.stderr       |  4 ++--
 vowpalwabbit/bfgs.cc                  | 26 +++++++++++------------
 5 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/test/RunTests b/test/RunTests
index 3be7c4bf0dc..d7167b750f2 100755
--- a/test/RunTests
+++ b/test/RunTests
@@ -897,7 +897,7 @@ __DATA__
     train-sets/ref/zero.stderr
 
 # Test 16: LBFGS early termination
-{VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic -b 20 --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 --holdout_off
+{VW} -k -c -d train-sets/rcv1_small.dat --loss_function=logistic --bfgs --mem 7 --passes 20 --termination 0.001 --l2 1.0 --holdout_off
     train-sets/ref/rcv1_small.stdout
     train-sets/ref/rcv1_small.stderr

diff --git a/test/train-sets/ref/frank.stderr b/test/train-sets/ref/frank.stderr
index 8a9206fafbb..2e3095a7043 100644
--- a/test/train-sets/ref/frank.stderr
+++ b/test/train-sets/ref/frank.stderr
@@ -10,16 +10,16 @@ decay_learning_rate = 1
 using l2 regularization
 m = 7
 Allocated 72M for weights and mem
-## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size time
+## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size time
  1 3.313292e+12 9.499654e+14 1.905397e+13 6.440241e+13 2.698544e+12 2.958580e-01 0.690
- 3 4.946559e+11 1.587411e+13 2.431760e+11 0.500000 0.000000 4.134247e+08 1.000000e+00 1.152
- 4 4.262257e+11 1.360521e+13 2.087762e+11 0.963277 0.926554 9.978023e+10 1.000000e+00 1.711
- 5 5.509095e+09 1.685125e+10 8.165054e+08 0.502727 0.005672 4.928468e+09 1.000000e+00 2.350
- 6 2.722915e+09 9.129735e+09 4.115941e+08 0.848424 0.697668 2.678213e+10 1.000000e+00 3.318
- 7 5.137694e+06 6.706857e+06 1.018132e+05 0.499525 -0.000686 2.755246e+05 1.000000e+00 4.523
+ 3 4.946559e+11 1.587411e+13 2.431760e+11 0.500000 0.000000 4.134247e+08 1.000000e+00 1.152
+ 4 4.262257e+11 1.360521e+13 2.087762e+11 0.963277 0.926554 9.978023e+10 1.000000e+00 1.711
+ 5 5.509095e+09 1.685125e+10 8.165054e+08 0.502727 0.005672 4.928468e+09 1.000000e+00 2.350
+ 6 2.722915e+09 9.129735e+09 4.115941e+08 0.848424 0.697668 2.678213e+10 1.000000e+00 3.318
+ 7 5.137694e+06 6.706857e+06 1.018132e+05 0.499525 -0.000686 2.755246e+05 1.000000e+00 4.523
  8 5.995715e+06 3.517040e+07 5.294065e+05 -0.644854 -2.289846 (revise x 0.5) 5.000000e-01 4.767
- 9 5.019540e+06 2.805249e+06 4.237483e+04 0.177600 -0.644881 1.362459e+04 1.000000e+00 6.061
-10 4.934764e+06 6.562551e+00 1.255939e-01 0.499927 -0.000122 7.431789e-03 1.000000e+00 7.549
+ 9 5.019540e+06 2.805249e+06 4.237483e+04 0.177600 -0.644881 1.362459e+04 1.000000e+00 6.061
+10 4.934764e+06 6.562551e+00 1.255939e-01 0.499927 -0.000122 7.431789e-03 1.000000e+00 7.549
 
 finished run

diff --git a/test/train-sets/ref/rcv1_small.stderr b/test/train-sets/ref/rcv1_small.stderr
index 96c5cff2bd4..0379f2d9873 100644
--- a/test/train-sets/ref/rcv1_small.stderr
+++ b/test/train-sets/ref/rcv1_small.stderr
@@ -1,28 +1,28 @@
 using l2 regularization = 1
 enabling BFGS based optimization **without** curvature calculation
-Num weight bits = 20
+Num weight bits = 18
 learning rate = 0.5
 initial_t = 0
 power_t = 0.5
 decay_learning_rate = 1
 m = 7
-Allocated 72M for weights and mem
-## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size
+Allocated 18M for weights and mem
+## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size
 creating cache_file = train-sets/rcv1_small.dat.cache
 Reading datafile = train-sets/rcv1_small.dat
 num sources = 1
- 1 0.69315 0.00266 0.87764 2.24708 776.93237 0.39057
- 3 0.51357 0.00493 4.93046 0.523903 0.088793 76.25748 1.00000
- 4 0.65936 0.04915 49.15202 -0.910622 -2.480116 (revise x 0.5) 0.50000
- 5 0.51658 0.00876 8.76105 -0.037665 -0.999616 (revise x 0.5) 0.25000
- 6 0.49499 0.00028 0.28254 0.463963 -0.056952 0.51262 1.00000
- 7 0.49354 0.00006 0.05641 0.619867 0.244153 0.08545 1.00000
- 8 0.49287 0.00005 0.05434 0.870687 0.741762 0.91640 1.00000
- 9 0.48978 0.00014 0.13750 0.772760 0.546930 2.01229 1.00000
-10 0.48472 0.00027 0.27437 0.750340 0.501776 3.21399 1.00000
-11 0.47920 0.00017 0.16867 0.671044 0.340515 1.40135 1.00000
-12 0.47707 0.00001 0.00760 0.593376 0.181239 0.09201 1.00000
-13 0.47691 0.00000 0.00168 0.593289 0.185020 0.00955 1.00000
+ 1 0.69315 0.00266 0.87764 2.24708 776.93237 0.39057
+ 3 0.51357 0.00493 4.93046 0.523903 0.088793 76.25748 1.00000
+ 4 0.65936 0.04915 49.15202 -0.910622 -2.480116 (revise x 0.5) 0.50000
+ 5 0.51658 0.00876 8.76105 -0.037665 -0.999616 (revise x 0.5) 0.25000
+ 6 0.49499 0.00028 0.28254 0.463963 -0.056952 0.51262 1.00000
+ 7 0.49354 0.00006 0.05641 0.619867 0.244153 0.08545 1.00000
+ 8 0.49287 0.00005 0.05434 0.870687 0.741762 0.91640 1.00000
+ 9 0.48978 0.00014 0.13750 0.772760 0.546930 2.01229 1.00000
+10 0.48472 0.00027 0.27437 0.750340 0.501776 3.21399 1.00000
+11 0.47920 0.00017 0.16867 0.671044 0.340515 1.40135 1.00000
+12 0.47707 0.00001 0.00760 0.593376 0.181239 0.09201 1.00000
+13 0.47691 0.00000 0.00168 0.593278 0.185019 0.00955 1.00000
 
 finished run
 number of examples = 13000

diff --git a/test/train-sets/ref/zero.stderr b/test/train-sets/ref/zero.stderr
index afcf80f11dc..6559d0669f7 100644
--- a/test/train-sets/ref/zero.stderr
+++ b/test/train-sets/ref/zero.stderr
@@ -7,11 +7,11 @@ power_t = 0.5
 decay_learning_rate = 1
 m = 7
 Allocated 72M for weights and mem
-## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size
+## avg. loss der. mag. d. m. cond. wolfe1 wolfe2 mix fraction curvature dir. magnitude step size
 creating cache_file = train-sets/zero.dat.cache
 Reading datafile = train-sets/zero.dat
 num sources = 1
- 1 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
+ 1 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000
 
 finished run
 number of examples = 10

diff --git a/vowpalwabbit/bfgs.cc b/vowpalwabbit/bfgs.cc
index 85d5d7d117d..076b392dbf1 100644
--- a/vowpalwabbit/bfgs.cc
+++ b/vowpalwabbit/bfgs.cc
@@ -341,7 +341,7 @@ void bfgs_iter_middle(vw& all, bfgs& b, float* mem, double* rho, double* alpha,
   { coef_j = alpha[j] - rho[j] * y_r;
     y_r = 0.;
-    for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
+    for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
     { mem = mem0 + (w.index() >> weights.stride_shift()) * b.mem_stride;
       (&(*w))[W_DIR] += (float)coef_j*mem[(2 * j + MEM_ST + origin) % b.mem_stride];
@@ -421,7 +421,7 @@ double wolfe_eval(vw& all, bfgs& b, float* mem, double loss_sum, double previous
 template <class T> double add_regularization(vw& all, bfgs& b, float regularization, T& weights)
 { //compute the derivative difference
   double ret = 0.;
-
+
   if (b.regularizers == nullptr)
     for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
     {
@@ -451,7 +451,7 @@ template <class T> double add_regularization(vw& all, bfgs& b, float regularizat
       ret -= 0.5*b.regularizers[2*i]*delta_weight*delta_weight;
     }
   }
-
+
   return ret;
 }
@@ -575,7 +575,7 @@ double derivative_in_direction(vw& all, bfgs& b, float* mem, int &origin, T& wei
 { double ret = 0.;
   for (typename T::iterator w = weights.begin(); w != weights.end(); ++w)
-  {
+  {
     float* mem1 = mem + (w.index() >> weights.stride_shift()) * b.mem_stride;
     ret += mem1[(MEM_GT + origin) % b.mem_stride] * (&(*w))[W_DIR];
   }
@@ -614,14 +614,14 @@ int process_pass(vw& all, bfgs& b)
   /********************************************************************/
   /* A) FIRST PASS FINISHED: INITIALIZE FIRST LINE SEARCH *************/
   /********************************************************************/
-  if (b.first_pass)
+  if (b.first_pass)
   { if(all.all_reduce != nullptr)
     { accumulate(all, all.weights, W_COND); //Accumulate preconditioner
       float temp = (float)b.importance_weight_sum;
       b.importance_weight_sum = accumulate_scalar(all, temp);
     }
     //finalize_preconditioner(all, b, all.l2_lambda);
-    if(all.all_reduce != nullptr)
+    if(all.all_reduce != nullptr)
     { float temp = (float)b.loss_sum;
       b.loss_sum = accumulate_scalar(all, temp); //Accumulate loss_sums
       accumulate(all, all.weights, 1); //Accumulate gradients from all nodes
@@ -645,7 +645,7 @@ int process_pass(vw& all, bfgs& b)
       ftime(&b.t_end_global);
       b.net_time = (int) (1000.0 * (b.t_end_global.time - b.t_start_global.time) + (b.t_end_global.millitm - b.t_start_global.millitm));
       if (!all.quiet)
-        fprintf(stderr, "%-10s\t%-10.5f\t%-10.5f\n", "", d_mag, b.step_size);
+        fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
       b.predictions.erase();
       update_weight(all, b.step_size);
     }
@@ -695,7 +695,7 @@ int process_pass(vw& all, bfgs& b)
       b.net_time = (int) (1000.0 * (b.t_end_global.time - b.t_start_global.time) + (b.t_end_global.millitm - b.t_start_global.millitm));
       float ratio = (b.step_size==0.f) ? 0.f : (float)new_step/(float)b.step_size;
       if (!all.quiet)
-        fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-10.5f\n",
+        fprintf(stderr, "%-10s\t%-10s\t(revise x %.1f)\t%-.5f\n",
                 "","",ratio, new_step);
       b.predictions.erase();
@@ -739,7 +739,7 @@ int process_pass(vw& all, bfgs& b)
       ftime(&b.t_end_global);
       b.net_time = (int) (1000.0 * (b.t_end_global.time - b.t_start_global.time) + (b.t_end_global.millitm - b.t_start_global.millitm));
       if (!all.quiet)
-        fprintf(stderr, "%-10s\t%-10.5f\t%-10.5f\n", "", d_mag, b.step_size);
+        fprintf(stderr, "%-10s\t%-10.5f\t%-.5f\n", "", d_mag, b.step_size);
       b.predictions.erase();
       update_weight(all, b.step_size);
     }
@@ -750,7 +750,7 @@ int process_pass(vw& all, bfgs& b)
   /********************************************************************/
   /* C) NOT FIRST PASS, CURVATURE CALCULATED **************************/
   /********************************************************************/
   else // just finished all second gradients
-  {
+  {
     if(all.all_reduce != nullptr)
     { float t = (float)b.curvature;
       b.curvature = accumulate_scalar(all, t); //Accumulate curvatures
@@ -779,7 +779,7 @@ int process_pass(vw& all, bfgs& b)
       b.net_time = (int) (1000.0 * (b.t_end_global.time - b.t_start_global.time) + (b.t_end_global.millitm - b.t_start_global.millitm));
       if (!all.quiet)
-        fprintf(stderr, "%-10.5f\t%-10.5f\t%-10.5f\n", b.curvature / b.importance_weight_sum, d_mag, b.step_size);
+        fprintf(stderr, "%-10.5f\t%-10.5f\t%-.5f\n", b.curvature / b.importance_weight_sum, d_mag, b.step_size);
       b.gradient_pass = true;
     }//now start computing derivatives.
     b.current_pass++;
@@ -944,7 +944,7 @@ void save_load_regularizer(vw& all, bfgs& b, io_buf& model_file, bool read, bool
     i++;
   }
   while ((!read && i < length) || (read && brw >0));
-
+
   if (read)
     regularizer_to_weight(all, b);
 }
@@ -979,7 +979,7 @@ void save_load(bfgs& b, io_buf& model_file, bool read, bool text)
   ftime(&b.t_start_global);
 
   if (!all->quiet)
-  { const char * header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-10s\n";
+  { const char * header_fmt = "%2s %-10s\t%-10s\t%-10s\t %-10s\t%-10s\t%-10s\t%-10s\t%-10s\t%-s\n";
     fprintf(stderr, header_fmt, "##", "avg. loss", "der. mag.", "d. m. cond.", "wolfe1", "wolfe2", "mix fraction", "curvature", "dir. magnitude", "step size");
     cerr.precision(5);