Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable JSON example parsing in Python bindings #1809

Merged
merged 6 commits into from
Mar 29, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 32 additions & 19 deletions python/pylibvw.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,20 @@ example_ptr my_empty_example(vw_ptr vw, size_t labelType)
return boost::shared_ptr<example>(ec, my_delete_example);
}

example_ptr my_read_example(vw_ptr all, size_t labelType, char*str)
example_ptr my_read_example(vw_ptr all, size_t labelType, char* str)
{ example*ec = my_empty_example0(all, labelType);
VW::read_line(*all, ec, str);
VW::setup_example(*all, ec);
ec->example_counter = labelType;
return boost::shared_ptr<example>(ec, my_delete_example);
}

// Wraps an already existing example: stores labelType in the example's
// example_counter field and returns a boost::shared_ptr<example> built from
// existing_example. The `all` argument is not used by this function.
example_ptr my_existing_example(vw_ptr all, size_t labelType, example_ptr existing_example)
{
existing_example->example_counter = labelType;
return boost::shared_ptr<example>(existing_example);
}

void my_finish_example(vw_ptr all, example_ptr ec)
{ // TODO
jackgerrits marked this conversation as resolved.
Show resolved Hide resolved
}
Expand All @@ -183,19 +189,16 @@ void my_learn(vw_ptr all, example_ptr ec)
}
}

// Parses `str` into an example with VW::read_example, learns on it, and returns
// the example's partial_prediction. The example is handed to VW::finish_example
// before returning, so the prediction is copied out first.
float my_learn_string(vw_ptr all, char*str)
{
  example* parsed = VW::read_example(*all, str);
  all->learn(*parsed);

  // Read the value before the example is finished.
  const float partial = parsed->partial_prediction;
  VW::finish_example(*all, *parsed);
  return partial;
}

// Runs predict on `ec` through the single-line view of the workspace's learner
// (as_singleline(all->l)) and returns the example's partial_prediction afterwards.
float my_predict(vw_ptr all, example_ptr ec)
{ as_singleline(all->l)->predict(*ec);
return ec->partial_prediction;
}

// Returns the is_multiline flag of the workspace's learner (all->l).
bool my_is_multiline(vw_ptr all)
{
return all->l->is_multiline;
}

template<bool learn>
void predict_or_learn(vw_ptr& all, py::list& ec)
{ multi_ex ex_coll;
Expand All @@ -211,20 +214,29 @@ void predict_or_learn(vw_ptr& all, py::list& ec)
else as_multiline(all->l)->predict(ex_coll);
}

// Parses `str` with the workspace's configured text_reader and returns the
// resulting examples as a Python list. VW::setup_example is run on every
// example before it is appended to the list.
py::list my_parse(vw_ptr& all, char* str)
{
  // Seed the array with one unused example before handing it to the reader.
  v_array<example*> parsed = v_init<example*>();
  parsed.push_back(&VW::get_unused_example(all.get()));
  all->p->text_reader(all.get(), str, strlen(str), parsed);

  py::list result;
  for (size_t idx = 0; idx < parsed.size(); ++idx)
  {
    example* current = parsed[idx];
    VW::setup_example(*all, current);
    result.append(current);
  }

  // Drop the temporary array; the example pointers were already appended to the list.
  parsed.clear();
  parsed.delete_v();
  return result;
}

// Learns on a Python list of examples by forwarding to predict_or_learn<true>.
void my_learn_multi_ex(vw_ptr& all, py::list& ec)
{ predict_or_learn<true>(all, ec); }

// Predicts on a Python list of examples by forwarding to predict_or_learn<false>.
void my_predict_multi_ex(vw_ptr& all, py::list& ec)
{ predict_or_learn<false>(all, ec); }

// Parses `str` into an example with VW::read_example, runs predict on it through
// the single-line view of the learner, and returns the example's
// partial_prediction. The example is handed to VW::finish_example before
// returning, so the prediction is copied out first.
float my_predict_string(vw_ptr all, char*str)
{
  example* parsed = VW::read_example(*all, str);
  as_singleline(all->l)->predict(*parsed);

  // Read the value before the example is finished.
  const float partial = parsed->partial_prediction;
  VW::finish_example(*all, *parsed);
  return partial;
}

string varray_char_to_string(v_array<char> &a)
{ string ret = "";
for (auto c : a)
Expand Down Expand Up @@ -699,9 +711,7 @@ BOOST_PYTHON_MODULE(pylibvw)
.def("finish", &my_finish, "stop VW by calling finish (and, eg, write weights to disk)")
.def("save", &my_save, "save model to filename")
.def("learn", &my_learn, "given a pyvw example, learn (and predict) on that example")
.def("learn_string", &my_learn_string, "given an example specified as a string (as in a VW data file), learn on that example")
.def("predict", &my_predict, "given a pyvw example, predict on that example")
.def("predict_string", &my_predict_string, "given an example specified as a string (as in a VW data file), predict on that example")
.def("hash_space", &VW::hash_space, "given a namespace (as a string), compute the hash of that namespace")
.def("hash_feature", &VW::hash_feature, "given a feature string (arg2) and a hashed namespace (arg3), hash that feature")
.def("finish_example", &my_finish_example, "tell VW that you're done with a given example")
Expand All @@ -725,6 +735,8 @@ BOOST_PYTHON_MODULE(pylibvw)

.def("learn_multi", &my_learn_multi_ex, "given a list pyvw examples, learn (and predict) on those examples")
.def("predict_multi", &my_predict_multi_ex, "given a list of pyvw examples, predict on that example")
.def("_parse", &my_parse, "Parse a string into a collection of VW examples")
.def("_is_multiline", &my_is_multiline, "true if the base reduction is multiline")

.def_readonly("lDefault", lDEFAULT, "Default label type (whatever vw was initialized with) -- used as input to the example() initializer")
.def_readonly("lBinary", lBINARY, "Binary label type -- used as input to the example() initializer")
Expand All @@ -746,6 +758,7 @@ BOOST_PYTHON_MODULE(pylibvw)
py::class_<example, example_ptr>("example", py::no_init)
.def("__init__", py::make_constructor(my_read_example), "Given a string as an argument parse that into a VW example (and run setup on it) -- default to multiclass label type")
.def("__init__", py::make_constructor(my_empty_example), "Construct an empty (non setup) example; you must provide a label type (vw.lBinary, vw.lMulticlass, etc.)")
.def("__init__", py::make_constructor(my_existing_example), "Create a new example object pointing to an existing object.")

.def("set_test_only", &my_set_test_only, "Change the test-only bit on an example")

Expand Down
48 changes: 44 additions & 4 deletions python/vowpalwabbit/pyvw.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,19 @@ def format_input(key, val):

self.finished = False

def parse(self, str_ex, labelType=pylibvw.vw.lDefault):
    """Parse a string into example object(s).

    Carriage returns are stripped from str_ex before it is passed to
    self._parse, and every raw example that comes back is wrapped in an
    example object built with labelType.

    Returns the list of wrapped examples when self._is_multiline() is
    true; otherwise returns the single wrapped example.

    Raises TypeError when the learner is not multiline and the number of
    parsed examples is not exactly one."""
    cleaned = str_ex.replace('\r', '')
    wrapped = [example(self, raw, labelType) for raw in self._parse(cleaned)]

    if self._is_multiline():
        return wrapped

    if len(wrapped) != 1:
        raise TypeError('expecting single line example, got multi_ex of len %i' % len(wrapped))
    return wrapped[0]

def num_weights(self):
"""Get length of weight vector."""
return pylibvw.vw.num_weights(self)
Expand All @@ -126,9 +139,13 @@ def learn(self, ec):
"""Perform an online update; ec can either be an example
object or a string (in which case it is parsed and then
learned on) or list which is iterated over."""
# If a string was given, parse it before passing to learner.
new_example = False
if isinstance(ec, str):
self.learn_string(ec)
elif isinstance(ec, example):
ec = self.parse(ec)
new_example = True

if isinstance(ec, example):
if hasattr(ec, 'setup_done') and not ec.setup_done:
ec.setup_example()
pylibvw.vw.learn(self, ec)
Expand All @@ -137,6 +154,12 @@ def learn(self, ec):
else:
raise TypeError('expecting string or example object as ec argument for learn, got %s' % type(ec))

if new_example:
if isinstance(ec, list):
map(lambda x: x.finish(), ec)
else:
ec.finish()

def predict(self, ec, prediction_type=None):
"""Just make a prediction on this example; ec can either be an example
object or a string (in which case it is parsed and then predicted on).
Expand All @@ -145,11 +168,16 @@ def predict(self, ec, prediction_type=None):
otherwise the learner's prediction type will determine the output."""

new_example = False
if isinstance(ec, (str, dict)):
if isinstance(ec, dict):
ec = self.example(ec)
ec.setup_done = True
new_example = True

# If a string was given, parse it before passing to learner.
if isinstance(ec, str):
ec = self.parse(ec)
new_example = True

if not isinstance(ec, example) and not isinstance(ec, list):
raise TypeError('expecting string, example object, or list of example objects as ec argument for predict, got %s' % type(ec))

Expand All @@ -170,7 +198,10 @@ def predict(self, ec, prediction_type=None):
prediction = get_prediction(ec[0], prediction_type)

if new_example:
ec.finish()
if isinstance(ec, list):
map(lambda x: x.finish(), ec)
else:
ec.finish()

return prediction

Expand Down Expand Up @@ -559,6 +590,15 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault):
self.finished = False
self.labelType = labelType

# NOTE(review): Python keeps only the last binding of a name in a class body. If this
# definition lives in the same class as the __init__(self, vw, initStringOrDict=None, ...)
# that precedes it, it replaces that constructor instead of overloading it -- confirm.
def __init__(self, vw, raw_example, labelType=pylibvw.vw.lDefault):
"""Wrap existing raw example object"""

# Arguments are forwarded as (vw, labelType, raw_example), i.e. the label type comes
# before the wrapped object in the underlying pylibvw.example constructor call.
pylibvw.example.__init__(self, vw, labelType, raw_example)
self.vw = vw
self.stride = vw.get_stride()
self.finished = False
self.labelType = labelType

def __del__(self):
self.finish()

Expand Down
31 changes: 31 additions & 0 deletions vowpalwabbit/parse_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,21 @@ void substring_to_example(vw* all, example* ae, substring example)
TC_parser<false> parser_line(bar_location, example.end, *all, ae);
}

// Splits the NUL-terminated string `phrase` into the pieces separated by
// `delimiter`.
//
// The result always has (number of delimiter occurrences + 1) entries, so
// leading, trailing, and adjacent delimiters produce empty strings, and an
// empty `phrase` produces a single empty string.
//
// Edge cases:
//  - a null `phrase` yields an empty vector;
//  - an empty `delimiter` cannot split anything, so the whole phrase is
//    returned as the only token (previously this never terminated).
std::vector<std::string> split(char* phrase, std::string delimiter)
{
  std::vector<std::string> tokens;
  if (phrase == nullptr)
    return tokens;

  const std::string text(phrase);
  if (delimiter.empty())
  {
    tokens.push_back(text);
    return tokens;
  }

  // Walk forward with a moving start offset instead of erasing the consumed
  // prefix, which keeps the scan linear in the input length.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = text.find(delimiter, start)) != std::string::npos)
  {
    tokens.push_back(text.substr(start, pos - start));
    start = pos + delimiter.length();
  }

  // Whatever follows the last delimiter (possibly nothing) is the final token.
  tokens.push_back(text.substr(start));
  return tokens;
}


namespace VW
{
void read_line(vw& all, example* ex, char* line)
Expand All @@ -465,4 +480,20 @@ void read_line(vw& all, example* ex, char* line)
while ((ss.end >= ss.begin) && (*(ss.end - 1) == '\n')) ss.end--;
substring_to_example(&all, ex, ss);
}

// Parses a newline separated block of text into examples, one example per line.
//
// `line` is split on "\n" and the i-th piece is parsed with read_line into
// examples[i]. Examples already present in `examples` are reused in order; an
// unused example is requested from `all` only once the array runs out. Because
// every piece produced by the split is parsed, a trailing newline results in a
// final empty line that is parsed as well.
//
// The length argument is not used: `line` is treated as NUL-terminated.
void read_lines(vw* all, char* line, size_t /*len*/, v_array<example*>& examples)
{
  auto lines = split(line, "\n");
  for (size_t i = 0; i < lines.size(); i++)
  {
    // Check if a new empty example needs to be added.
    if (examples.size() < i + 1)
    {
      examples.push_back(&VW::get_unused_example(all));
    }

    // read_line takes a mutable char*; hand it the string's own buffer rather
    // than casting away the const on c_str().
    std::string& current = lines[i];
    read_line(*all, examples[i], &current[0]);
  }
}


} // namespace VW
2 changes: 2 additions & 0 deletions vowpalwabbit/parse_example.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ namespace VW
{
example& get_unused_example(vw* all);
void read_line(vw& all, example* ex, char* line); // read example from the line.
void read_lines(vw* all, char* line, size_t len, v_array<example*>& examples); // read examples from the new line separated strings.

} // namespace VW

int read_features_string(vw* all, v_array<example*>& examples);
Expand Down
82 changes: 45 additions & 37 deletions vowpalwabbit/parse_example_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -1176,63 +1176,71 @@ void read_line_decision_service_json(vw& all, v_array<example*>& examples, char*
} // namespace VW

template <bool audit>
int read_features_json(vw* all, v_array<example*>& examples)
void line_to_examples_json(vw* all, char* line, size_t num_chars, v_array<example*>& examples)
{
bool reread;
do
if (all->p->decision_service_json)
{
reread = false;

char* line;
size_t num_chars;
size_t num_chars_initial = read_features(all, line, num_chars);
if (num_chars_initial < 1)
return (int)num_chars_initial;

line[num_chars] = '\0';
if (all->p->decision_service_json)
// Skip lines that do not start with "{"
if (line[0] != '{')
{
// Skip lines that do not start with "{"
if (line[0] != '{')
{
reread = true;
continue;
}

DecisionServiceInteraction interaction;
VW::template read_line_decision_service_json<audit>(*all, examples, line, num_chars, false,
reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all, &interaction);
return;
}

if (interaction.skipLearn)
{
VW::return_multiple_example(*all, examples);
examples.push_back(&VW::get_unused_example(all));
reread = true;
}
DecisionServiceInteraction interaction;
VW::template read_line_decision_service_json<audit>(*all, examples, line, num_chars, false,
reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all, &interaction);

// let's continue reading data until we find a line with actions provided
if (interaction.actions.size() == 0)
reread = true;
if (interaction.skipLearn)
{
VW::return_multiple_example(*all, examples);
examples.push_back(&VW::get_unused_example(all));
return;
}
else
VW::template read_line_json<audit>(
*all, examples, line, reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all);
} while (reread);

// let's continue reading data until we find a line with actions provided
if (interaction.actions.size() == 0)
VW::return_multiple_example(*all, examples);
examples.push_back(&VW::get_unused_example(all));
return;
}
else
VW::template read_line_json<audit>(
*all, examples, line, reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all);

// note: the json parser does single pass parsing and cannot determine if a shared example is needed.
// since the communication between the parsing thread and the main learner expects examples to be requested in order (as
// they're laid out in memory) there is no way to determine upfront if a shared example exists, thus even if there are
// no features for the shared example, still an empty example is returned.

// insert new line example at the end
if (examples.size() > 1)
{ // insert new line example at the end
{
example& ae = VW::get_unused_example(all);
char empty = '\0';
substring example = {&empty, &empty};
substring_to_example(all, &ae, example);

examples.push_back(&ae);
}
}

// Reads input lines with read_features and converts each one with
// line_to_examples_json<audit> until `examples` is non-empty.
//
// Returns 1 once at least one example is present. If read_features reports a
// value below 1, that value is returned (cast to int) without parsing.
template <bool audit>
int read_features_json(vw* all, v_array<example*>& examples)
{
  for (;;)
  {
    char* line;
    size_t num_chars;
    const size_t num_chars_initial = read_features(all, line, num_chars);
    if (num_chars_initial < 1)
      return (int)num_chars_initial;

    // Terminate the buffer at the reported length before parsing it.
    line[num_chars] = '\0';
    line_to_examples_json<audit>(all, line, num_chars, examples);

    // Lines that produced no examples are skipped and the next one is read.
    if (examples.size() != 0)
      return 1;
  }
}
5 changes: 5 additions & 0 deletions vowpalwabbit/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -539,20 +539,25 @@ void enable_sources(vw& all, bool quiet, size_t passes, input_options& input_opt
if (all.audit || all.hash_inv)
{
all.p->reader = &read_features_json<true>;
all.p->text_reader = &line_to_examples_json<true>;
all.p->audit = true;
all.p->jsonp = std::make_shared<json_parser<true>>();
}
else
{
all.p->reader = &read_features_json<false>;
all.p->text_reader = &line_to_examples_json<false>;
all.p->audit = false;
all.p->jsonp = std::make_shared<json_parser<false>>();
}

all.p->decision_service_json = input_options.dsjson;
}
else
{
all.p->reader = read_features_string;
all.p->text_reader = VW::read_lines;
}

all.p->resettable = all.p->write_cache;
}
Expand Down
Loading