-
Notifications
You must be signed in to change notification settings - Fork 1.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Warm start for cbify #1534
Warm start for cbify #1534
Changes from 139 commits
c891ae8
cc0ac23
4a27941
539b1e4
961a5a5
0fbc26a
369b3ea
8f096a5
904134f
e271344
8879525
ced4bbd
ac71d8d
9debba8
e295aff
ed2f2bf
0da506a
122c8a3
c01f8cc
0d4d633
ded8f53
68d8600
aace037
41127f8
94c8103
0a25495
46d91c0
fad3955
1351a31
f921051
e7384bb
630fd5f
a7d5360
601a756
f2f9bb6
8298ec6
543bab9
c9beeb0
2343af4
32d33ba
f1355b7
caac66e
9a4eef5
b30d987
6735a02
024d9cc
bd5fe57
e3cda02
3f64541
87f9afa
2e10698
4b54dc0
5e993af
24c79e8
502d593
b06e454
1922659
f6539b5
b471ddb
6259c67
c330450
6fa0031
7240acb
621b392
d1fbfd7
6bddc96
f529db0
e84c7d9
7e6b889
6f7fc00
1a5b3e0
9e431ed
d157858
bc94f6c
1bab4c3
0f6e8db
a32c2e7
4ab1d8c
5d7dc31
a4fb02f
f8d14ab
67ffd89
7b6e2ba
8fce742
0f3b946
cf3b488
e9ec432
b60e872
df4267d
a350e0a
f7f1366
9d6a364
a47aba8
b4d87e0
19c161c
405622d
e2502f6
ed980e3
36b174a
4c3eed3
8117896
e12a8da
4e639bc
6540308
be93a25
5fa45cb
2020c50
8a51d16
24648e2
f95d154
558f1a2
648f0d9
5561a12
6069739
d9573e1
01bf93e
83da642
3f03785
452e4aa
aa9e9f7
52439aa
2f146e7
e5db844
4f12bb6
2011b7a
24f970c
4d8811d
7bc56af
4971477
db1da5e
12b36b9
1c0400b
0e3b7bc
63d8c40
99d642b
217ee32
6cf41b6
3ad0f7b
2fa610e
5775bd6
753d885
5e923d8
35d9ab0
c71d3e3
94c7147
a7408f0
3df666e
13bf77c
cca8449
5776849
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/multiclass | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
1.000000 1.000000 4 1.0 4 3 2 | ||
1.000000 1.000000 5 2.0 5 3 2 | ||
1.000000 1.000000 7 4.0 7 3 2 | ||
|
||
finished run | ||
number of examples = 10 | ||
weighted example sum = 7.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 1.000000 | ||
total feature number = 20 | ||
average variance estimate = 171.578140 | ||
theoretical average variance = 200.000000 | ||
last lambda chosen = 0.031250 among lambdas ranging from 0.031250 to 0.937500 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/cs_cb | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
0.000000 0.000000 2 1.0 known 2 4 | ||
0.000000 0.000000 3 2.0 known 2 4 | ||
|
||
finished run | ||
number of examples = 3 | ||
weighted example sum = 2.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 0.000000 | ||
total feature number = 12 | ||
average variance estimate = 1.034483 | ||
theoretical average variance = 60.000000 | ||
last lambda chosen = 0.031250 among lambdas ranging from 0.031250 to 0.937500 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/multiclass | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
0.000000 0.000000 4 1.0 4 4 2 | ||
0.500000 1.000000 5 2.0 5 4 2 | ||
0.750000 1.000000 7 4.0 7 3 2 | ||
|
||
finished run | ||
number of examples = 10 | ||
weighted example sum = 7.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 0.857143 | ||
total feature number = 20 | ||
average variance estimate = 143.156311 | ||
theoretical average variance = 200.000000 | ||
last lambda chosen = 0.937500 among lambdas ranging from 0.031250 to 0.937500 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/multiclass | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
1.000000 1.000000 4 1.0 4 3 2 | ||
1.000000 1.000000 5 2.0 5 9 2 | ||
0.750000 0.500000 7 4.0 7 7 2 | ||
|
||
finished run | ||
number of examples = 10 | ||
weighted example sum = 7.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 0.857143 | ||
total feature number = 20 | ||
average variance estimate = 60.903835 | ||
theoretical average variance = 200.000000 | ||
last lambda chosen = 0.000000 among lambdas ranging from 0.000000 to 1.000000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/multiclass | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
1.000000 1.000000 4 1.0 4 3 2 | ||
1.000000 1.000000 5 2.0 5 3 2 | ||
1.000000 1.000000 7 4.0 7 3 2 | ||
|
||
finished run | ||
number of examples = 10 | ||
weighted example sum = 7.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 1.000000 | ||
total feature number = 20 | ||
average variance estimate = 1.000000 | ||
theoretical average variance = inf | ||
last lambda chosen = 0.000000 among lambdas ranging from 0.000000 to 0.000000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/multiclass | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
1.000000 1.000000 4 1.0 4 5 2 | ||
1.000000 1.000000 5 2.0 5 9 2 | ||
0.750000 0.500000 7 4.0 7 7 2 | ||
|
||
finished run | ||
number of examples = 10 | ||
weighted example sum = 7.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 0.714286 | ||
total feature number = 20 | ||
average variance estimate = 7.512840 | ||
theoretical average variance = 200.000000 | ||
last lambda chosen = 1.000000 among lambdas ranging from 1.000000 to 1.000000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
Num weight bits = 18 | ||
learning rate = 0.5 | ||
initial_t = 0 | ||
power_t = 0.5 | ||
using no cache | ||
Reading datafile = train-sets/multiclass | ||
num sources = 1 | ||
average since example example current current current | ||
loss last counter weight label predict features | ||
1.000000 1.000000 4 1.0 4 7 2 | ||
1.000000 1.000000 5 2.0 5 1 2 | ||
0.750000 0.500000 7 4.0 7 10 2 | ||
|
||
finished run | ||
number of examples = 10 | ||
weighted example sum = 7.000000 | ||
weighted label sum = 0.000000 | ||
average loss = 0.857143 | ||
total feature number = 20 | ||
average variance estimate = 4.685901 | ||
theoretical average variance = 200.000000 | ||
last lambda chosen = 0.500000 among lambdas ranging from 0.500000 to 0.500000 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -269,7 +269,9 @@ void predict_or_learn_first(cb_explore_adf& data, multi_learner& base, multi_ex& | |
template <bool is_learn> | ||
void predict_or_learn_greedy(cb_explore_adf& data, multi_learner& base, multi_ex& examples) | ||
{ | ||
// Explore uniform random an epsilon fraction of the time. | ||
data.offset = examples[0]->ft_offset; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why do we set something which is never used? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we need this additional line - we had an email discussion on this on July 7, 2018: If we don't have this line, in warm_cb.cc, base.learn(data.adf_data.ecs, 0) / base.learn(data.adf_data.ecs, 1)'s updates seem to be on the same set of weights. |
||
//Explore uniform random an epsilon fraction of the time. | ||
|
||
if (is_learn && test_adf_sequence(examples) != nullptr) | ||
multiline_learn_or_predict<true>(base, examples, data.offset); | ||
else | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I am a little confused about this line - what is ((float)mydata.gen_cs.event_sum / (float)mydata.gen_cs.action_sum)? Is it 1/K?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's more like 1 / average K. These are defined via this: https://github.com/VowpalWabbit/vowpal_wabbit/blob/master/vowpalwabbit/gen_cs_example.cc#L171