Skip to content

Commit

Permalink
Enable JSON example parsing in Python bindings (VowpalWabbit#1809)
Browse files Browse the repository at this point in the history
* Add text_reader, enable json parsing in python bindings

* Strip \r, raise error on len > 1 single examples
  • Loading branch information
jackgerrits committed May 15, 2019
1 parent ae3d6a4 commit 1c6bec8
Show file tree
Hide file tree
Showing 7 changed files with 161 additions and 60 deletions.
51 changes: 32 additions & 19 deletions python/pylibvw.cc
Original file line number Diff line number Diff line change
Expand Up @@ -162,14 +162,20 @@ example_ptr my_empty_example(vw_ptr vw, size_t labelType)
return boost::shared_ptr<example>(ec, my_delete_example);
}

example_ptr my_read_example(vw_ptr all, size_t labelType, char*str)
// Build a new example from one line of VW text input.
//
// The example is allocated through my_empty_example0, filled in by
// VW::read_line, passed through VW::setup_example, and returned in a shared
// pointer that uses my_delete_example as its deleter.
// NOTE(review): example_counter is reused to carry labelType; presumably the
// deleter reads it back -- confirm against my_delete_example.
example_ptr my_read_example(vw_ptr all, size_t labelType, char* str)
{
  example* parsed = my_empty_example0(all, labelType);
  VW::read_line(*all, parsed, str);
  VW::setup_example(*all, parsed);
  parsed->example_counter = labelType;

  boost::shared_ptr<example> owner(parsed, my_delete_example);
  return owner;
}

// Constructor overload that points a Python example object at an example that
// already exists instead of creating a new one.
//
// Records labelType in the example's example_counter field and returns a copy
// of the incoming shared pointer. `all` is not used.
example_ptr my_existing_example(vw_ptr all, size_t labelType, example_ptr existing_example)
{
  existing_example->example_counter = labelType;

  boost::shared_ptr<example> alias(existing_example);
  return alias;
}

// Exposed to Python as "finish_example" ("tell VW that you're done with a given example").
// Currently a no-op placeholder: neither `all` nor `ec` is touched.
void my_finish_example(vw_ptr all, example_ptr ec)
{ // TODO: not implemented yet; the body is intentionally empty.
}
Expand All @@ -183,19 +189,16 @@ void my_learn(vw_ptr all, example_ptr ec)
}
}

// Parse `str` as a single VW text example, learn on it, and return the
// example's partial_prediction.
//
// The prediction is copied out before the example is handed to
// VW::finish_example, since the example must not be read afterwards.
float my_learn_string(vw_ptr all, char* str)
{
  example* parsed = VW::read_example(*all, str);
  all->learn(*parsed);

  const float partial = parsed->partial_prediction;
  VW::finish_example(*all, *parsed);
  return partial;
}

// Run the single-line learner's predict on `ec` and return the example's
// partial_prediction afterwards.
float my_predict(vw_ptr all, example_ptr ec)
{
  example& target = *ec;
  as_singleline(all->l)->predict(target);
  return target.partial_prediction;
}

// Report the is_multiline flag of the learner held by `all`.
bool my_is_multiline(vw_ptr all)
{
  const bool multiline = all->l->is_multiline;
  return multiline;
}

template<bool learn>
void predict_or_learn(vw_ptr& all, py::list& ec)
{ multi_ex ex_coll;
Expand All @@ -211,20 +214,29 @@ void predict_or_learn(vw_ptr& all, py::list& ec)
else as_multiline(all->l)->predict(ex_coll);
}

// Parse a text buffer into VW examples and return them as a Python list.
//
// The buffer is handed to the parser's configured text_reader together with a
// working array seeded with one unused example. Every example left in the
// array afterwards is passed through VW::setup_example and appended to the
// result list in order. The working array itself is released before returning.
// NOTE(review): raw example* values are appended to the list; how they are
// converted and who returns them to VW is not visible here -- confirm.
py::list my_parse(vw_ptr& all, char* str)
{
  auto vw_raw = all.get();

  v_array<example*> parsed = v_init<example*>();
  parsed.push_back(&VW::get_unused_example(vw_raw));
  all->p->text_reader(vw_raw, str, strlen(str), parsed);

  py::list result;
  for (example* current : parsed)
  {
    VW::setup_example(*all, current);
    result.append(current);
  }

  // Only the pointer array is freed here, not the examples it referred to.
  parsed.clear();
  parsed.delete_v();
  return result;
}

// Learn on a Python list of examples; forwards to predict_or_learn<true>.
void my_learn_multi_ex(vw_ptr& all, py::list& ec)
{
  predict_or_learn<true>(all, ec);
}

// Predict on a Python list of examples; forwards to predict_or_learn<false>.
void my_predict_multi_ex(vw_ptr& all, py::list& ec)
{
  predict_or_learn<false>(all, ec);
}

// Parse `str` as a single VW text example, predict on it with the single-line
// learner, and return the example's partial_prediction.
//
// The prediction is copied out before the example is handed to
// VW::finish_example, since the example must not be read afterwards.
float my_predict_string(vw_ptr all, char* str)
{
  example* parsed = VW::read_example(*all, str);
  as_singleline(all->l)->predict(*parsed);

  const float partial = parsed->partial_prediction;
  VW::finish_example(*all, *parsed);
  return partial;
}

string varray_char_to_string(v_array<char> &a)
{ string ret = "";
for (auto c : a)
Expand Down Expand Up @@ -699,9 +711,7 @@ BOOST_PYTHON_MODULE(pylibvw)
.def("finish", &my_finish, "stop VW by calling finish (and, eg, write weights to disk)")
.def("save", &my_save, "save model to filename")
.def("learn", &my_learn, "given a pyvw example, learn (and predict) on that example")
.def("learn_string", &my_learn_string, "given an example specified as a string (as in a VW data file), learn on that example")
.def("predict", &my_predict, "given a pyvw example, predict on that example")
.def("predict_string", &my_predict_string, "given an example specified as a string (as in a VW data file), predict on that example")
.def("hash_space", &VW::hash_space, "given a namespace (as a string), compute the hash of that namespace")
.def("hash_feature", &VW::hash_feature, "given a feature string (arg2) and a hashed namespace (arg3), hash that feature")
.def("finish_example", &my_finish_example, "tell VW that you're done with a given example")
Expand All @@ -725,6 +735,8 @@ BOOST_PYTHON_MODULE(pylibvw)

.def("learn_multi", &my_learn_multi_ex, "given a list pyvw examples, learn (and predict) on those examples")
.def("predict_multi", &my_predict_multi_ex, "given a list of pyvw examples, predict on that example")
.def("_parse", &my_parse, "Parse a string into a collection of VW examples")
.def("_is_multiline", &my_is_multiline, "true if the base reduction is multiline")

.def_readonly("lDefault", lDEFAULT, "Default label type (whatever vw was initialized with) -- used as input to the example() initializer")
.def_readonly("lBinary", lBINARY, "Binary label type -- used as input to the example() initializer")
Expand All @@ -746,6 +758,7 @@ BOOST_PYTHON_MODULE(pylibvw)
py::class_<example, example_ptr>("example", py::no_init)
.def("__init__", py::make_constructor(my_read_example), "Given a string as an argument parse that into a VW example (and run setup on it) -- default to multiclass label type")
.def("__init__", py::make_constructor(my_empty_example), "Construct an empty (non setup) example; you must provide a label type (vw.lBinary, vw.lMulticlass, etc.)")
.def("__init__", py::make_constructor(my_existing_example), "Create a new example object pointing to an existing object.")

.def("set_test_only", &my_set_test_only, "Change the test-only bit on an example")

Expand Down
48 changes: 44 additions & 4 deletions python/vowpalwabbit/pyvw.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,19 @@ def format_input(key, val):

self.finished = False

def parse(self, str_ex, labelType=pylibvw.vw.lDefault):
    """Parse example text into example object(s).

    Carriage returns are stripped from ``str_ex`` before it is handed to
    ``self._parse``. Each raw result is wrapped in an ``example``.

    Returns a list of examples when ``self._is_multiline()`` is true,
    otherwise the single parsed example.

    Raises TypeError when the learner is not multiline and the text did not
    produce exactly one example.
    """
    cleaned = str_ex.replace('\r', '')
    wrapped = [example(self, raw, labelType) for raw in self._parse(cleaned)]

    if self._is_multiline():
        return wrapped

    if len(wrapped) != 1:
        raise TypeError('expecting single line example, got multi_ex of len %i' % len(wrapped))
    return wrapped[0]

def num_weights(self):
"""Get length of weight vector."""
return pylibvw.vw.num_weights(self)
Expand All @@ -126,9 +139,13 @@ def learn(self, ec):
"""Perform an online update; ec can either be an example
object or a string (in which case it is parsed and then
learned on) or list which is iterated over."""
# If a string was given, parse it before passing to learner.
new_example = False
if isinstance(ec, str):
self.learn_string(ec)
elif isinstance(ec, example):
ec = self.parse(ec)
new_example = True

if isinstance(ec, example):
if hasattr(ec, 'setup_done') and not ec.setup_done:
ec.setup_example()
pylibvw.vw.learn(self, ec)
Expand All @@ -137,6 +154,12 @@ def learn(self, ec):
else:
raise TypeError('expecting string or example object as ec argument for learn, got %s' % type(ec))

if new_example:
if isinstance(ec, list):
map(lambda x: x.finish(), ec)
else:
ec.finish()

def predict(self, ec, prediction_type=None):
"""Just make a prediction on this example; ec can either be an example
object or a string (in which case it is parsed and then predicted on).
Expand All @@ -145,11 +168,16 @@ def predict(self, ec, prediction_type=None):
otherwise the learner's prediction type will determine the output."""

new_example = False
if isinstance(ec, (str, dict)):
if isinstance(ec, dict):
ec = self.example(ec)
ec.setup_done = True
new_example = True

# If a string was given, parse it before passing to learner.
if isinstance(ec, str):
ec = self.parse(ec)
new_example = True

if not isinstance(ec, example) and not isinstance(ec, list):
raise TypeError('expecting string, example object, or list of example objects as ec argument for predict, got %s' % type(ec))

Expand All @@ -170,7 +198,10 @@ def predict(self, ec, prediction_type=None):
prediction = get_prediction(ec[0], prediction_type)

if new_example:
ec.finish()
if isinstance(ec, list):
map(lambda x: x.finish(), ec)
else:
ec.finish()

return prediction

Expand Down Expand Up @@ -559,6 +590,15 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault):
self.finished = False
self.labelType = labelType

def __init__(self, vw, raw_example, labelType=pylibvw.vw.lDefault):
    """Wrap an existing raw example object.

    vw          -- vw instance this example belongs to; also asked for its stride
    raw_example -- existing example object to point at
    labelType   -- label type forwarded to the underlying pylibvw constructor
    """
    # NOTE(review): another __init__(self, vw, initStringOrDict=None, labelType=...)
    # appears immediately before this one; if both live in the same class,
    # this definition replaces it -- confirm that is intended.
    pylibvw.example.__init__(self, vw, labelType, raw_example)

    self.vw = vw
    self.stride = vw.get_stride()
    self.labelType = labelType
    self.finished = False

def __del__(self):
    """Call ``finish()`` on this object when it is being destroyed."""
    self.finish()

Expand Down
31 changes: 31 additions & 0 deletions vowpalwabbit/parse_example.cc
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,21 @@ void substring_to_example(vw* all, example* ae, substring example)
TC_parser<false> parser_line(bar_location, example.end, *all, ae);
}

// Split a null-terminated C string into the pieces separated by `delimiter`.
//
// @param phrase     Null-terminated input text. A null pointer yields an
//                   empty result instead of undefined behaviour.
// @param delimiter  Separator to split on. An empty delimiter cannot match
//                   anything meaningful, so the whole input is returned as a
//                   single token (the previous implementation looped forever).
// @return The tokens in order. Adjacent, leading, or trailing delimiters
//         produce empty tokens, so "a\n" split on "\n" gives {"a", ""} and an
//         empty input gives {""}.
std::vector<std::string> split(char* phrase, std::string delimiter)
{
  std::vector<std::string> list;
  if (phrase == nullptr)
    return list;

  const std::string s(phrase);
  if (delimiter.empty())
  {
    list.push_back(s);
    return list;
  }

  // Walk the string with a moving start offset rather than erasing the
  // consumed prefix each time, which keeps the scan linear in the input size.
  size_t start = 0;
  size_t pos = 0;
  while ((pos = s.find(delimiter, start)) != std::string::npos)
  {
    list.push_back(s.substr(start, pos - start));
    start = pos + delimiter.length();
  }
  list.push_back(s.substr(start));
  return list;
}


namespace VW
{
void read_line(vw& all, example* ex, char* line)
Expand All @@ -465,4 +480,20 @@ void read_line(vw& all, example* ex, char* line)
while ((ss.end >= ss.begin) && (*(ss.end - 1) == '\n')) ss.end--;
substring_to_example(&all, ex, ss);
}

// Parse newline-separated text into examples, one example per line.
//
// `line` is split on "\n" and the i-th piece is parsed into examples[i] via
// read_line. When `examples` has fewer entries than there are pieces, extra
// examples are obtained from VW::get_unused_example and appended. The length
// argument is ignored; `line` is read as a null-terminated string.
void read_lines(vw* all, char* line, size_t /*len*/, v_array<example*>& examples)
{
  std::vector<std::string> text_lines = split(line, "\n");

  size_t index = 0;
  for (std::string& text : text_lines)
  {
    // Grow the output array on demand so that examples[index] exists.
    if (index >= examples.size())
      examples.push_back(&VW::get_unused_example(all));

    read_line(*all, examples[index], const_cast<char*>(text.c_str()));
    ++index;
  }
}


} // namespace VW
2 changes: 2 additions & 0 deletions vowpalwabbit/parse_example.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ namespace VW
{
// NOTE(review): presumably hands out an example that is free for the caller to
// fill in -- confirm against the definition.
example& get_unused_example(vw* all);
// Parses one line of text input into `ex`.
void read_line(vw& all, example* ex, char* line); // read example from the line.
// Splits `line` on '\n' and parses each piece into examples[i] with read_line,
// appending examples from get_unused_example when `examples` is too short.
// `len` is currently ignored; `line` is read as a null-terminated string.
void read_lines(vw* all, char* line, size_t len, v_array<example*>& examples); // read examples from the new line separated strings.

} // namespace VW

int read_features_string(vw* all, v_array<example*>& examples);
Expand Down
82 changes: 45 additions & 37 deletions vowpalwabbit/parse_example_json.h
Original file line number Diff line number Diff line change
Expand Up @@ -1176,63 +1176,71 @@ void read_line_decision_service_json(vw& all, v_array<example*>& examples, char*
} // namespace VW

template <bool audit>
int read_features_json(vw* all, v_array<example*>& examples)
void line_to_examples_json(vw* all, char* line, size_t num_chars, v_array<example*>& examples)
{
bool reread;
do
if (all->p->decision_service_json)
{
reread = false;

char* line;
size_t num_chars;
size_t num_chars_initial = read_features(all, line, num_chars);
if (num_chars_initial < 1)
return (int)num_chars_initial;

line[num_chars] = '\0';
if (all->p->decision_service_json)
// Skip lines that do not start with "{"
if (line[0] != '{')
{
// Skip lines that do not start with "{"
if (line[0] != '{')
{
reread = true;
continue;
}

DecisionServiceInteraction interaction;
VW::template read_line_decision_service_json<audit>(*all, examples, line, num_chars, false,
reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all, &interaction);
return;
}

if (interaction.skipLearn)
{
VW::return_multiple_example(*all, examples);
examples.push_back(&VW::get_unused_example(all));
reread = true;
}
DecisionServiceInteraction interaction;
VW::template read_line_decision_service_json<audit>(*all, examples, line, num_chars, false,
reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all, &interaction);

// let's continue reading data until we find a line with actions provided
if (interaction.actions.size() == 0)
reread = true;
if (interaction.skipLearn)
{
VW::return_multiple_example(*all, examples);
examples.push_back(&VW::get_unused_example(all));
return;
}
else
VW::template read_line_json<audit>(
*all, examples, line, reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all);
} while (reread);

// let's continue reading data until we find a line with actions provided
if (interaction.actions.size() == 0)
VW::return_multiple_example(*all, examples);
examples.push_back(&VW::get_unused_example(all));
return;
}
else
VW::template read_line_json<audit>(
*all, examples, line, reinterpret_cast<VW::example_factory_t>(&VW::get_unused_example), all);

// note: the json parser does single pass parsing and cannot determine if a shared example is needed.
// since the communication between the parsing thread and the main learner expects examples to be requested in order (as
// they're laid out in memory) there is no way to determine upfront if a shared example exists thus even if there are
// no features for the shared example, still an empty example is returned.

// insert new line example at the end
if (examples.size() > 1)
{ // insert new line example at the end
{
example& ae = VW::get_unused_example(all);
char empty = '\0';
substring example = {&empty, &empty};
substring_to_example(all, &ae, example);

examples.push_back(&ae);
}
}

// Reader entry point for JSON input.
//
// Pulls one line at a time via read_features, null-terminates it, and passes
// it to line_to_examples_json. Repeats until `examples` is non-empty.
//
// Returns 1 once `examples` holds at least one entry. If read_features
// reports a value below 1, that value is returned (converted to int) instead.
template <bool audit>
int read_features_json(vw* all, v_array<example*>& examples)
{
  for (;;)
  {
    char* line = nullptr;
    size_t num_chars = 0;
    const size_t num_chars_initial = read_features(all, line, num_chars);
    if (num_chars_initial < 1)
      return static_cast<int>(num_chars_initial);

    // The JSON parsing below relies on a terminated buffer.
    line[num_chars] = '\0';

    line_to_examples_json<audit>(all, line, num_chars, examples);

    // Stop as soon as this line yielded something; otherwise read another.
    if (examples.size() != 0)
      return 1;
  }
}
5 changes: 5 additions & 0 deletions vowpalwabbit/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -583,20 +583,25 @@ void enable_sources(vw& all, bool quiet, size_t passes, input_options& input_opt
if (all.audit || all.hash_inv)
{
all.p->reader = &read_features_json<true>;
all.p->text_reader = &line_to_examples_json<true>;
all.p->audit = true;
all.p->jsonp = std::make_shared<json_parser<true>>();
}
else
{
all.p->reader = &read_features_json<false>;
all.p->text_reader = &line_to_examples_json<false>;
all.p->audit = false;
all.p->jsonp = std::make_shared<json_parser<false>>();
}

all.p->decision_service_json = input_options.dsjson;
}
else
{
all.p->reader = read_features_string;
all.p->text_reader = VW::read_lines;
}

all.p->resettable = all.p->write_cache;
}
Expand Down
Loading

0 comments on commit 1c6bec8

Please sign in to comment.