diff --git a/python/pylibvw.cc b/python/pylibvw.cc index 18d11b85645..d142e97816f 100644 --- a/python/pylibvw.cc +++ b/python/pylibvw.cc @@ -162,7 +162,7 @@ example_ptr my_empty_example(vw_ptr vw, size_t labelType) return boost::shared_ptr(ec, my_delete_example); } -example_ptr my_read_example(vw_ptr all, size_t labelType, char*str) +example_ptr my_read_example(vw_ptr all, size_t labelType, char* str) { example*ec = my_empty_example0(all, labelType); VW::read_line(*all, ec, str); VW::setup_example(*all, ec); @@ -170,6 +170,12 @@ example_ptr my_read_example(vw_ptr all, size_t labelType, char*str) return boost::shared_ptr(ec, my_delete_example); } +example_ptr my_existing_example(vw_ptr all, size_t labelType, example_ptr existing_example) +{ + existing_example->example_counter = labelType; + return boost::shared_ptr(existing_example); +} + void my_finish_example(vw_ptr all, example_ptr ec) { // TODO } @@ -183,19 +189,16 @@ void my_learn(vw_ptr all, example_ptr ec) } } -float my_learn_string(vw_ptr all, char*str) -{ example*ec = VW::read_example(*all, str); - all->learn(*ec); - float pp = ec->partial_prediction; - VW::finish_example(*all, *ec); - return pp; -} - float my_predict(vw_ptr all, example_ptr ec) { as_singleline(all->l)->predict(*ec); return ec->partial_prediction; } +bool my_is_multiline(vw_ptr all) +{ + return all->l->is_multiline; +} + template void predict_or_learn(vw_ptr& all, py::list& ec) { multi_ex ex_coll; @@ -211,20 +214,29 @@ void predict_or_learn(vw_ptr& all, py::list& ec) else as_multiline(all->l)->predict(ex_coll); } +py::list my_parse(vw_ptr& all, char* str) +{ + v_array examples = v_init(); + examples.push_back(&VW::get_unused_example(all.get())); + all->p->text_reader(all.get(), str, strlen(str), examples); + + py::list example_collection; + for (auto ex : examples) + { + VW::setup_example(*all, ex); + example_collection.append(ex); + } + examples.clear(); + examples.delete_v(); + return example_collection; +} + void my_learn_multi_ex(vw_ptr& all, py::list& ec) { predict_or_learn(all, ec); } void my_predict_multi_ex(vw_ptr& all, py::list& ec) { predict_or_learn(all, ec); } -float my_predict_string(vw_ptr all, char*str) -{ example*ec = VW::read_example(*all, str); - as_singleline(all->l)->predict(*ec); - float pp = ec->partial_prediction; - VW::finish_example(*all, *ec); - return pp; -} - string varray_char_to_string(v_array &a) { string ret = ""; for (auto c : a) @@ -699,9 +711,7 @@ BOOST_PYTHON_MODULE(pylibvw) .def("finish", &my_finish, "stop VW by calling finish (and, eg, write weights to disk)") .def("save", &my_save, "save model to filename") .def("learn", &my_learn, "given a pyvw example, learn (and predict) on that example") - .def("learn_string", &my_learn_string, "given an example specified as a string (as in a VW data file), learn on that example") .def("predict", &my_predict, "given a pyvw example, predict on that example") - .def("predict_string", &my_predict_string, "given an example specified as a string (as in a VW data file), predict on that example") .def("hash_space", &VW::hash_space, "given a namespace (as a string), compute the hash of that namespace") .def("hash_feature", &VW::hash_feature, "given a feature string (arg2) and a hashed namespace (arg3), hash that feature") .def("finish_example", &my_finish_example, "tell VW that you're done with a given example") @@ -725,6 +735,8 @@ BOOST_PYTHON_MODULE(pylibvw) .def("learn_multi", &my_learn_multi_ex, "given a list pyvw examples, learn (and predict) on those examples") .def("predict_multi", &my_predict_multi_ex, "given a list of pyvw examples, predict on that example") + .def("_parse", &my_parse, "Parse a string into a collection of VW examples") + .def("_is_multiline", &my_is_multiline, "true if the base reduction is multiline") .def_readonly("lDefault", lDEFAULT, "Default label type (whatever vw was initialized with) -- used as input to the example() initializer") .def_readonly("lBinary", lBINARY, "Binary label type -- used as input to the example() initializer") @@ -746,6 +758,7 @@ BOOST_PYTHON_MODULE(pylibvw) py::class_("example", py::no_init) .def("__init__", py::make_constructor(my_read_example), "Given a string as an argument parse that into a VW example (and run setup on it) -- default to multiclass label type") .def("__init__", py::make_constructor(my_empty_example), "Construct an empty (non setup) example; you must provide a label type (vw.lBinary, vw.lMulticlass, etc.)") + .def("__init__", py::make_constructor(my_existing_example), "Create a new example object pointing to an existing object.") .def("set_test_only", &my_set_test_only, "Change the test-only bit on an example") diff --git a/python/vowpalwabbit/pyvw.py b/python/vowpalwabbit/pyvw.py index b2b6fb347e1..f3bd9c75aec 100644 --- a/python/vowpalwabbit/pyvw.py +++ b/python/vowpalwabbit/pyvw.py @@ -113,6 +113,19 @@ def format_input(key, val): self.finished = False + def parse(self, str_ex, labelType=pylibvw.vw.lDefault): + """Returns a collection of examples for a multiline example learner or a single + example for a single example learner.""" + str_ex = str_ex.replace('\r', '') + ec = self._parse(str_ex) + ec = [example(self, x, labelType) for x in ec] + if not self._is_multiline(): + if len(ec) == 1: + ec = ec[0] + else: + raise TypeError('expecting single line example, got multi_ex of len %i' % len(ec)) + return ec + def num_weights(self): """Get length of weight vector.""" return pylibvw.vw.num_weights(self) @@ -126,9 +139,13 @@ def learn(self, ec): """Perform an online update; ec can either be an example object or a string (in which case it is parsed and then learned on) or list which is iterated over.""" + # If a string was given, parse it before passing to learner. + new_example = False if isinstance(ec, str): - self.learn_string(ec) - elif isinstance(ec, example): + ec = self.parse(ec) + new_example = True + + if isinstance(ec, example): if hasattr(ec, 'setup_done') and not ec.setup_done: ec.setup_example() pylibvw.vw.learn(self, ec) @@ -137,6 +154,12 @@ def learn(self, ec): else: raise TypeError('expecting string or example object as ec argument for learn, got %s' % type(ec)) + if new_example: + if isinstance(ec, list): + map(lambda x: x.finish(), ec) + else: + ec.finish() + def predict(self, ec, prediction_type=None): """Just make a prediction on this example; ec can either be an example object or a string (in which case it is parsed and then predicted on). @@ -145,11 +168,16 @@ def predict(self, ec, prediction_type=None): otherwise the the learner's prediction type will determine the output.""" new_example = False - if isinstance(ec, (str, dict)): + if isinstance(ec, dict): ec = self.example(ec) ec.setup_done = True new_example = True + # If a string was given, parse it before passing to learner. + if isinstance(ec, str): + ec = self.parse(ec) + new_example = True + if not isinstance(ec, example) and not isinstance(ec, list): raise TypeError('expecting string, example object, or list of example objects as ec argument for predict, got %s' % type(ec)) @@ -170,7 +198,10 @@ def predict(self, ec, prediction_type=None): prediction = get_prediction(ec[0], prediction_type) if new_example: - ec.finish() + if isinstance(ec, list): + map(lambda x: x.finish(), ec) + else: + ec.finish() return prediction @@ -559,6 +590,15 @@ def __init__(self, vw, initStringOrDict=None, labelType=pylibvw.vw.lDefault): self.finished = False self.labelType = labelType + def __init__(self, vw, raw_example, labelType=pylibvw.vw.lDefault): + """Wrap existing raw example object""" + + pylibvw.example.__init__(self, vw, labelType, raw_example) + self.vw = vw + self.stride = vw.get_stride() + self.finished = False + self.labelType = labelType + def __del__(self): self.finish() diff --git a/vowpalwabbit/parse_example.cc b/vowpalwabbit/parse_example.cc index 436cf281ae4..d4c2b64d1a3 100644 --- a/vowpalwabbit/parse_example.cc +++ b/vowpalwabbit/parse_example.cc @@ -457,6 +457,21 @@ void substring_to_example(vw* all, example* ae, substring example) TC_parser parser_line(bar_location, example.end, *all, ae); } +std::vector split(char* phrase, std::string delimiter){ + std::vector list; + std::string s = std::string(phrase); + size_t pos = 0; + std::string token; + while ((pos = s.find(delimiter)) != std::string::npos) { + token = s.substr(0, pos); + list.push_back(token); + s.erase(0, pos + delimiter.length()); + } + list.push_back(s); + return list; +} + + namespace VW { void read_line(vw& all, example* ex, char* line) @@ -465,4 +480,20 @@ void read_line(vw& all, example* ex, char* line) while ((ss.end >= ss.begin) && (*(ss.end - 1) == '\n')) ss.end--; substring_to_example(&all, ex, ss); } + +void read_lines(vw* all, char* line, size_t /*len*/, v_array& examples) +{ + auto lines = split(line, "\n"); + for(size_t i = 0; i < lines.size(); i++) + { + // Check if a new empty example needs to be added. + if(examples.size() < i + 1) + { + examples.push_back(&VW::get_unused_example(all)); + } + read_line(*all, examples[i], const_cast(lines[i].c_str())); + } +} + + } // namespace VW diff --git a/vowpalwabbit/parse_example.h b/vowpalwabbit/parse_example.h index af43a917207..f61b3f7fba4 100644 --- a/vowpalwabbit/parse_example.h +++ b/vowpalwabbit/parse_example.h @@ -22,6 +22,8 @@ namespace VW { example& get_unused_example(vw* all); void read_line(vw& all, example* ex, char* line); // read example from the line. +void read_lines(vw* all, char* line, size_t len, v_array& examples); // read examples from the new line separated strings. + } // namespace VW int read_features_string(vw* all, v_array& examples); diff --git a/vowpalwabbit/parse_example_json.h b/vowpalwabbit/parse_example_json.h index 36a1f5987ff..ee5ed3d037c 100644 --- a/vowpalwabbit/parse_example_json.h +++ b/vowpalwabbit/parse_example_json.h @@ -1176,56 +1176,45 @@ void read_line_decision_service_json(vw& all, v_array& examples, char* } // namespace VW template -int read_features_json(vw* all, v_array& examples) +void line_to_examples_json(vw* all, char* line, size_t num_chars, v_array& examples) { - bool reread; - do + if (all->p->decision_service_json) { - reread = false; - - char* line; - size_t num_chars; - size_t num_chars_initial = read_features(all, line, num_chars); - if (num_chars_initial < 1) - return (int)num_chars_initial; - - line[num_chars] = '\0'; - if (all->p->decision_service_json) + // Skip lines that do not start with "{" + if (line[0] != '{') { - // Skip lines that do not start with "{" - if (line[0] != '{') - { - reread = true; - continue; - } - - DecisionServiceInteraction interaction; - VW::template read_line_decision_service_json(*all, examples, line, num_chars, false, - reinterpret_cast(&VW::get_unused_example), all, &interaction); + return; + } - if (interaction.skipLearn) - { - VW::return_multiple_example(*all, examples); - examples.push_back(&VW::get_unused_example(all)); - reread = true; - } + DecisionServiceInteraction interaction; + VW::template read_line_decision_service_json(*all, examples, line, num_chars, false, + reinterpret_cast(&VW::get_unused_example), all, &interaction); - // let's continue reading data until we find a line with actions provided - if (interaction.actions.size() == 0) - reread = true; + if (interaction.skipLearn) + { + VW::return_multiple_example(*all, examples); + examples.push_back(&VW::get_unused_example(all)); + return; } - else - VW::template read_line_json( - *all, examples, line, reinterpret_cast(&VW::get_unused_example), all); - } while (reread); + + // let's continue reading data until we find a line with actions provided + if (interaction.actions.size() == 0) + VW::return_multiple_example(*all, examples); + examples.push_back(&VW::get_unused_example(all)); + return; + } + else + VW::template read_line_json( + *all, examples, line, reinterpret_cast(&VW::get_unused_example), all); // note: the json parser does single pass parsing and cannot determine if a shared example is needed. // since the communication between the parsing thread the main learner expects examples to be requested in order (as // they're layed out in memory) there is no way to determine upfront if a shared example exists thus even if there are // no features for the shared example, still an empty example is returned. + // insert new line example at the end if (examples.size() > 1) - { // insert new line example at the end + { example& ae = VW::get_unused_example(all); char empty = '\0'; substring example = {&empty, &empty}; @@ -1233,6 +1222,25 @@ int read_features_json(vw* all, v_array& examples) examples.push_back(&ae); } +} + +template +int read_features_json(vw* all, v_array& examples) +{ + // Keep reading lines until a valid set of examples is produced. + do + { + char* line; + size_t num_chars; + size_t num_chars_initial = read_features(all, line, num_chars); + if (num_chars_initial < 1) + return (int)num_chars_initial; + + // Ensure there is a null terminator. + line[num_chars] = '\0'; + + line_to_examples_json(all, line, num_chars, examples); + } while (examples.size() == 0); return 1; } diff --git a/vowpalwabbit/parser.cc b/vowpalwabbit/parser.cc index 7a14db3c94d..7558dbdecad 100644 --- a/vowpalwabbit/parser.cc +++ b/vowpalwabbit/parser.cc @@ -539,12 +539,14 @@ void enable_sources(vw& all, bool quiet, size_t passes, input_options& input_opt if (all.audit || all.hash_inv) { all.p->reader = &read_features_json; + all.p->text_reader = &line_to_examples_json; all.p->audit = true; all.p->jsonp = std::make_shared>(); } else { all.p->reader = &read_features_json; + all.p->text_reader = &line_to_examples_json; all.p->audit = false; all.p->jsonp = std::make_shared>(); } @@ -552,7 +554,10 @@ void enable_sources(vw& all, bool quiet, size_t passes, input_options& input_opt all.p->decision_service_json = input_options.dsjson; } else + { all.p->reader = read_features_string; + all.p->text_reader = VW::read_lines; + } all.p->resettable = all.p->write_cache; } diff --git a/vowpalwabbit/parser.h b/vowpalwabbit/parser.h index 5ef89f57603..a3ff844f1da 100644 --- a/vowpalwabbit/parser.h +++ b/vowpalwabbit/parser.h @@ -67,6 +67,8 @@ struct parser io_buf* input = nullptr; // Input source(s) int (*reader)(vw*, v_array& examples); + void (*text_reader)(vw*, char*, size_t, v_array&); + hash_func_t hasher; bool resettable; // Whether or not the input can be reset. io_buf* output = nullptr; // Where to output the cache.