diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index a1ede3c9b..e00a87d8d 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -209,7 +209,7 @@ jobs: run: | if [[ $OS == ubuntu* ]]; then sudo ./dependencies.sh - python3 -m pip install setuptools + python3 -m pip install setuptools build else ./dependencies.sh fi @@ -238,7 +238,9 @@ jobs: with: repository: restream/reindexer-py - name: Install PyReindexer - run: sudo python3 setup.py install + run: | + python -m build + python -m pip install . - name: Test PyReindexer run: | cd pyreindexer diff --git a/cjson/decoder.go b/cjson/decoder.go index 317942572..ed6aba6a8 100644 --- a/cjson/decoder.go +++ b/cjson/decoder.go @@ -36,8 +36,7 @@ func fieldByTag(t reflect.Type, tag string) (result reflect.StructField, ok bool } for i := 0; i < t.NumField(); i++ { result = t.Field(i) - if ftag := result.Tag.Get("json"); len(ftag) > 0 { - ftag, _ = splitStr(ftag, ',') + if ftag, _ := splitStr(result.Tag.Get("json"), ','); len(ftag) > 0 { if tag == ftag { return result, true } diff --git a/clang-tidy/.clang-tidy b/clang-tidy/.clang-tidy deleted file mode 100644 index 063df74f1..000000000 --- a/clang-tidy/.clang-tidy +++ /dev/null @@ -1,31 +0,0 @@ -Checks: 'clang-diagnostic-*, - clang-analyzer-*, - performance-*, - bugprone-*, - -bugprone-exception-escape, - -bugprone-branch-clone, - -bugprone-easily-swappable-parameters, - -bugprone-macro-parentheses, - -bugprone-signed-char-misuse, - -bugprone-narrowing-conversions, - -bugprone-reserved-identifier, - -bugprone-implicit-widening-of-multiplication-result, - -bugprone-assignment-in-if-condition, - -bugprone-parent-virtual-call, - -bugprone-integer-division, - -bugprone-unhandled-self-assignment, - -bugprone-inc-dec-in-conditions, - -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, - -performance-no-int-to-ptr, - -performance-enum-size, - -performance-avoid-endl' -# clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling - too many unnecessary warning in vendored code -# performance-no-int-to-ptr - consider how to fix this -# bugprone-macro-parentheses - consider fixing -WarningsAsErrors: '*' -HeaderFilterRegex: '.*(?= 4.0.0 are given under - # the top level key 'Diagnostics' in the output yaml files - mergekey = "Diagnostics" - merged=[] - for replacefile in glob.iglob(os.path.join(tmpdir, '*.yaml')): - content = yaml.safe_load(open(replacefile, 'r')) - if not content: - continue # Skip empty files. - merged.extend(content.get(mergekey, [])) - - if merged: - # MainSourceFile: The key is required by the definition inside - # include/clang/Tooling/ReplacementsYaml.h, but the value - # is actually never used inside clang-apply-replacements, - # so we set it to '' here. - output = {'MainSourceFile': '', mergekey: merged} - with open(mergefile, 'w') as out: - yaml.safe_dump(output, out) - else: - # Empty the file: - open(mergefile, 'w').close() - - -def find_binary(arg, name, build_path): - """Get the path for a binary or exit""" - if arg: - if shutil.which(arg): - return arg - else: - raise SystemExit( - "error: passed binary '{}' was not found or is not executable" - .format(arg)) - - built_path = os.path.join(build_path, "bin", name) - binary = shutil.which(name) or shutil.which(built_path) - if binary: - return binary - else: - raise SystemExit( - "error: failed to find {} in $PATH or at {}" - .format(name, built_path)) - - -def apply_fixes(args, clang_apply_replacements_binary, tmpdir): - """Calls clang-apply-fixes on a given directory.""" - invocation = [clang_apply_replacements_binary] - invocation.append('-ignore-insert-conflict') - if args.format: - invocation.append('-format') - if args.style: - invocation.append('-style=' + args.style) - invocation.append(tmpdir) - subprocess.call(invocation) - - -def run_tidy(args, clang_tidy_binary, tmpdir, build_path, queue, lock, - failed_files): - """Takes filenames out of queue and runs clang-tidy on them.""" - while True: - name = queue.get() - invocation = get_tidy_invocation(name, clang_tidy_binary, args.checks, - tmpdir, build_path, args.header_filter, - args.allow_enabling_alpha_checkers, - args.extra_arg, args.extra_arg_before, - args.quiet, args.config_file, args.config, - args.line_filter, args.use_color, - args.plugins) - - proc = subprocess.Popen(invocation, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - output, err = proc.communicate() - if proc.returncode != 0: - if proc.returncode < 0: - msg = "%s: terminated by signal %d\n" % (name, -proc.returncode) - err += msg.encode('utf-8') - failed_files.append(name) - with lock: - sys.stdout.write(' '.join(invocation) + '\n' + output.decode('utf-8')) - if len(err) > 0: - sys.stdout.flush() - sys.stderr.write(err.decode('utf-8')) - queue.task_done() - - -def main(): - parser = argparse.ArgumentParser(description='Runs clang-tidy over all files ' - 'in a compilation database. Requires ' - 'clang-tidy and clang-apply-replacements in ' - '$PATH or in your build directory.') - parser.add_argument('-allow-enabling-alpha-checkers', - action='store_true', help='allow alpha checkers from ' - 'clang-analyzer.') - parser.add_argument('-clang-tidy-binary', metavar='PATH', - default='clang-tidy-18', - help='path to clang-tidy binary') - parser.add_argument('-clang-apply-replacements-binary', metavar='PATH', - default='clang-apply-replacements-18', - help='path to clang-apply-replacements binary') - parser.add_argument('-checks', default=None, - help='checks filter, when not specified, use clang-tidy ' - 'default') - config_group = parser.add_mutually_exclusive_group() - config_group.add_argument('-config', default=None, - help='Specifies a configuration in YAML/JSON format: ' - ' -config="{Checks: \'*\', ' - ' CheckOptions: {x: y}}" ' - 'When the value is empty, clang-tidy will ' - 'attempt to find a file named .clang-tidy for ' - 'each source file in its parent directories.') - config_group.add_argument('-config-file', default=None, - help='Specify the path of .clang-tidy or custom config ' - 'file: e.g. -config-file=/some/path/myTidyConfigFile. ' - 'This option internally works exactly the same way as ' - '-config option after reading specified config file. ' - 'Use either -config-file or -config, not both.') - parser.add_argument('-header-filter', default=None, - help='regular expression matching the names of the ' - 'headers to output diagnostics from. Diagnostics from ' - 'the main file of each translation unit are always ' - 'displayed.') - parser.add_argument('-line-filter', default=None, - help='List of files with line ranges to filter the' - 'warnings.') - if yaml: - parser.add_argument('-export-fixes', metavar='filename', dest='export_fixes', - help='Create a yaml file to store suggested fixes in, ' - 'which can be applied with clang-apply-replacements.') - parser.add_argument('-j', type=int, default=0, - help='number of tidy instances to be run in parallel.') - parser.add_argument('files', nargs='*', default=['.*'], - help='files to be processed (regex on path)') - parser.add_argument('-fix', action='store_true', help='apply fix-its') - parser.add_argument('-format', action='store_true', help='Reformat code ' - 'after applying fixes') - parser.add_argument('-style', default='file', help='The style of reformat ' - 'code after applying fixes') - parser.add_argument('-use-color', type=strtobool, nargs='?', const=True, - help='Use colors in diagnostics, overriding clang-tidy\'s' - ' default behavior. This option overrides the \'UseColor' - '\' option in .clang-tidy file, if any.') - parser.add_argument('-p', dest='build_path', - help='Path used to read a compile command database.') - parser.add_argument('-extra-arg', dest='extra_arg', - action='append', default=[], - help='Additional argument to append to the compiler ' - 'command line.') - parser.add_argument('-extra-arg-before', dest='extra_arg_before', - action='append', default=[], - help='Additional argument to prepend to the compiler ' - 'command line.') - parser.add_argument('-ignore', default=DEFAULT_CLANG_TIDY_IGNORE, - help='File path to clang-tidy-ignore') - parser.add_argument('-quiet', action='store_true', - help='Run clang-tidy in quiet mode') - parser.add_argument('-load', dest='plugins', - action='append', default=[], - help='Load the specified plugin in clang-tidy.') - args = parser.parse_args() - - db_path = 'compile_commands.json' - - if args.build_path is not None: - build_path = args.build_path - else: - # Find our database - build_path = find_compilation_database(db_path) - - clang_tidy_binary = find_binary(args.clang_tidy_binary, "clang-tidy", - build_path) - - tmpdir = None - if args.fix or (yaml and args.export_fixes): - clang_apply_replacements_binary = find_binary( - args.clang_apply_replacements_binary, "clang-apply-replacements", - build_path) - tmpdir = tempfile.mkdtemp() - - try: - invocation = get_tidy_invocation("", clang_tidy_binary, args.checks, - None, build_path, args.header_filter, - args.allow_enabling_alpha_checkers, - args.extra_arg, args.extra_arg_before, - args.quiet, args.config_file, args.config, - args.line_filter, args.use_color, - args.plugins) - invocation.append('-list-checks') - invocation.append('-') - if args.quiet: - # Even with -quiet we still want to check if we can call clang-tidy. - with open(os.devnull, 'w') as dev_null: - subprocess.check_call(invocation, stdout=dev_null) - else: - subprocess.check_call(invocation) - except: - print("Unable to run clang-tidy.", file=sys.stderr) - sys.exit(1) - - # Load the database and extract all files. - database = json.load(open(os.path.join(build_path, db_path))) - files = set([make_absolute(entry['file'], entry['directory']) - for entry in database]) - files, excluded = filter_files(args.ignore, files) - if excluded: - print("Excluding the following files:\n" + "\n".join(excluded) + "\n") - - max_task = args.j - if max_task == 0: - max_task = multiprocessing.cpu_count() - - # Build up a big regexy filter from all command line arguments. - file_name_re = re.compile('|'.join(args.files)) - - return_code = 0 - try: - # Spin up a bunch of tidy-launching threads. - task_queue = queue.Queue(max_task) - # List of files with a non-zero return code. - failed_files = [] - lock = threading.Lock() - for _ in range(max_task): - t = threading.Thread(target=run_tidy, - args=(args, clang_tidy_binary, tmpdir, build_path, - task_queue, lock, failed_files)) - t.daemon = True - t.start() - - # Fill the queue with files. - for name in files: - if file_name_re.search(name): - task_queue.put(name) - - # Wait for all threads to be done. - task_queue.join() - if len(failed_files): - return_code = 1 - - except KeyboardInterrupt: - # This is a sad hack. Unfortunately subprocess goes - # bonkers with ctrl-c and we start forking merrily. - print('\nCtrl-C detected, goodbye.') - if tmpdir: - shutil.rmtree(tmpdir) - os.kill(0, 9) - - if yaml and args.export_fixes: - print('Writing fixes to ' + args.export_fixes + ' ...') - try: - merge_replacement_files(tmpdir, args.export_fixes) - except: - print('Error exporting fixes.\n', file=sys.stderr) - traceback.print_exc() - return_code=1 - - if args.fix: - print('Applying fixes ...') - try: - apply_fixes(args, clang_apply_replacements_binary, tmpdir) - except: - print('Error applying fixes.\n', file=sys.stderr) - traceback.print_exc() - return_code = 1 - - if tmpdir: - shutil.rmtree(tmpdir) - sys.exit(return_code) - - -if __name__ == '__main__': - main() diff --git a/cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh b/cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh new file mode 100755 index 000000000..873181e20 --- /dev/null +++ b/cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh @@ -0,0 +1,197 @@ +#!/bin/bash +# Task: https://github.com/restream/reindexer/-/issues/1188 +set -e + +function KillAndRemoveServer { + local pid=$1 + kill $pid + wait $pid + yum remove -y 'reindexer*' > /dev/null +} + +function WaitForDB { + # wait until DB is loaded + set +e # disable "exit on error" so the script won't stop when DB's not loaded yet + is_connected=$(reindexer_tool --dsn $ADDRESS --command '\databases list'); + while [[ $is_connected != "test" ]] + do + sleep 2 + is_connected=$(reindexer_tool --dsn $ADDRESS --command '\databases list'); + done + set -e +} + +function CompareNamespacesLists { + local ns_list_actual=$1 + local ns_list_expected=$2 + local pid=$3 + + diff=$(echo ${ns_list_actual[@]} ${ns_list_expected[@]} | tr ' ' '\n' | sort | uniq -u) # compare in any order + if [ "$diff" == "" ]; then + echo "## PASS: namespaces list not changed" + else + echo "##### FAIL: namespaces list was changed" + echo "expected: $ns_list_expected" + echo "actual: $ns_list_actual" + KillAndRemoveServer $pid; + exit 1 + fi +} + +function CompareMemstats { + local actual=$1 + local expected=$2 + local pid=$3 + diff=$(echo ${actual[@]} ${expected[@]} | tr ' ' '\n' | sed 's/\(.*\),$/\1/' | sort | uniq -u) # compare in any order + if [ "$diff" == "" ]; then + echo "## PASS: memstats not changed" + else + echo "##### FAIL: memstats was changed" + echo "expected: $expected" + echo "actual: $actual" + KillAndRemoveServer $pid; + exit 1 + fi +} + + +RX_SERVER_CURRENT_VERSION_RPM="$(basename build/reindexer-*server*.rpm)" +VERSION_FROM_RPM=$(echo "$RX_SERVER_CURRENT_VERSION_RPM" | grep -o '.*server-..') +VERSION=$(echo ${VERSION_FROM_RPM: -2:1}) # one-digit version + +echo "## choose latest release rpm file" +if [ $VERSION == 3 ]; then + LATEST_RELEASE=$(python3 cpp_src/cmd/reindexer_server/test/get_last_rx_version.py -v 3) + namespaces_list_expected=$'purchase_options_ext_dict\nchild_account_recommendations\n#config\n#activitystats\nradio_channels\ncollections\n#namespaces\nwp_imports_tasks\nepg_genres\nrecom_media_items_personal\nrecom_epg_archive_default\n#perfstats\nrecom_epg_live_default\nmedia_view_templates\nasset_video_servers\nwp_tasks_schedule\nadmin_roles\n#clientsstats\nrecom_epg_archive_personal\nrecom_media_items_similars\nmenu_items\naccount_recommendations\nkaraoke_items\nmedia_items\nbanners\n#queriesperfstats\nrecom_media_items_default\nrecom_epg_live_personal\nservices\n#memstats\nchannels\nmedia_item_recommendations\nwp_tasks_tasks\nepg' +elif [ $VERSION == 4 ]; then + LATEST_RELEASE=$(python3 cpp_src/cmd/reindexer_server/test/get_last_rx_version.py -v 4) + # replicationstats ns added for v4 + namespaces_list_expected=$'purchase_options_ext_dict\nchild_account_recommendations\n#config\n#activitystats\n#replicationstats\nradio_channels\ncollections\n#namespaces\nwp_imports_tasks\nepg_genres\nrecom_media_items_personal\nrecom_epg_archive_default\n#perfstats\nrecom_epg_live_default\nmedia_view_templates\nasset_video_servers\nwp_tasks_schedule\nadmin_roles\n#clientsstats\nrecom_epg_archive_personal\nrecom_media_items_similars\nmenu_items\naccount_recommendations\nkaraoke_items\nmedia_items\nbanners\n#queriesperfstats\nrecom_media_items_default\nrecom_epg_live_personal\nservices\n#memstats\nchannels\nmedia_item_recommendations\nwp_tasks_tasks\nepg' +else + echo "Unknown version" + exit 1 +fi + +echo "## downloading latest release rpm file: $LATEST_RELEASE" +curl "http://repo.itv.restr.im/itv-api-ng/7/x86_64/$LATEST_RELEASE" --output $LATEST_RELEASE; +echo "## downloading example DB" +curl "https://github.com/restream/reindexer_testdata/-/raw/main/dump_demo.zip" --output dump_demo.zip; +unzip -o dump_demo.zip # unzips into demo_test.rxdump; + +ADDRESS="cproto://127.0.0.1:6534/" +DB_NAME="test" + +memstats_expected=$'[ +{"name":"account_recommendations","replication":{"data_hash":6833710705,"data_count":1}}, +{"name":"admin_roles","replication":{"data_hash":1896088071,"data_count":2}}, +{"name":"asset_video_servers","replication":{"data_hash":7404222244,"data_count":97}}, +{"name":"banners","replication":{"data_hash":0,"data_count":0}}, +{"name":"channels","replication":{"data_hash":457292509431319,"data_count":3941}}, +{"name":"child_account_recommendations","replication":{"data_hash":6252344969,"data_count":1}}, +{"name":"collections","replication":{"data_hash":0,"data_count":0}}, +{"name":"epg","replication":{"data_hash":-7049751653258,"data_count":1623116}}, +{"name":"epg_genres","replication":{"data_hash":8373644068,"data_count":1315}}, +{"name":"karaoke_items","replication":{"data_hash":5858155773472,"data_count":4500}}, +{"name":"media_item_recommendations","replication":{"data_hash":-6520334670,"data_count":35886}}, +{"name":"media_items","replication":{"data_hash":-1824301168479972392,"data_count":65448}}, +{"name":"media_view_templates","replication":{"data_hash":0,"data_count":0}}, +{"name":"menu_items","replication":{"data_hash":0,"data_count":0}}, +{"name":"purchase_options_ext_dict","replication":{"data_hash":24651210926,"data_count":3}}, +{"name":"radio_channels","replication":{"data_hash":37734732881,"data_count":28}}, +{"name":"recom_epg_archive_default","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_epg_archive_personal","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_epg_live_default","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_epg_live_personal","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_media_items_default","replication":{"data_hash":8288213744,"data_count":3}}, +{"name":"recom_media_items_personal","replication":{"data_hash":0,"data_count":0}}, +{"name":"recom_media_items_similars","replication":{"data_hash":-672103903,"data_count":33538}}, +{"name":"services","replication":{"data_hash":0,"data_count":0}}, +{"name":"wp_imports_tasks","replication":{"data_hash":777859741066,"data_count":1145}}, +{"name":"wp_tasks_schedule","replication":{"data_hash":12595790956,"data_count":4}}, +{"name":"wp_tasks_tasks","replication":{"data_hash":28692716680,"data_count":281}} +] +Returned 27 rows' + +echo "##### Forward compatibility test #####" + +DB_PATH=$(pwd)"/rx_db" + +echo "Database: "$DB_PATH + +echo "## installing latest release: $LATEST_RELEASE" +yum install -y $LATEST_RELEASE > /dev/null; +# run RX server with disabled logging +reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +reindexer_tool --dsn $ADDRESS$DB_NAME -f demo_test.rxdump --createdb; +sleep 1; + +namespaces_1=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_1; +CompareNamespacesLists "${namespaces_1[@]}" "${namespaces_list_expected[@]}" $server_pid; + +memstats_1=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_1[@]}" "${memstats_expected[@]}" $server_pid; + +KillAndRemoveServer $server_pid; + +echo "## installing current version: $RX_SERVER_CURRENT_VERSION_RPM" +yum install -y build/*.rpm > /dev/null; +reindexer_server -l0 --corelog=none --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +WaitForDB + +namespaces_2=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_2; +CompareNamespacesLists "${namespaces_2[@]}" "${namespaces_1[@]}" $server_pid; + + +memstats_2=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_2[@]}" "${memstats_1[@]}" $server_pid; + +KillAndRemoveServer $server_pid; +rm -rf $DB_PATH; +sleep 1; + +echo "##### Backward compatibility test #####" + +echo "## installing current version: $RX_SERVER_CURRENT_VERSION_RPM" +yum install -y build/*.rpm > /dev/null; +reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +reindexer_tool --dsn $ADDRESS$DB_NAME -f demo_test.rxdump --createdb; +sleep 1; + +namespaces_3=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_3; +CompareNamespacesLists "${namespaces_3[@]}" "${namespaces_list_expected[@]}" $server_pid; + + +memstats_3=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_3[@]}" "${memstats_expected[@]}" $server_pid; + +KillAndRemoveServer $server_pid; + +echo "## installing latest release: $LATEST_RELEASE" +yum install -y $LATEST_RELEASE > /dev/null; +reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH & +server_pid=$! +sleep 2; + +WaitForDB + +namespaces_4=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list'); +echo $namespaces_4; +CompareNamespacesLists "${namespaces_4[@]}" "${namespaces_3[@]}" $server_pid; + +memstats_4=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select name, replication.data_hash, replication.data_count from #memstats order by name'); +CompareMemstats "${memstats_4[@]}" "${memstats_3[@]}" $server_pid; + +KillAndRemoveServer $server_pid; +rm -rf $DB_PATH; diff --git a/cpp_src/core/cjson/cjsondecoder.cc b/cpp_src/core/cjson/cjsondecoder.cc index 3064f14e8..b1ab6a5f4 100644 --- a/cpp_src/core/cjson/cjsondecoder.cc +++ b/cpp_src/core/cjson/cjsondecoder.cc @@ -40,11 +40,12 @@ bool CJsonDecoder::decodeCJson(Payload& pl, Serializer& rdser, WrSerializer& wrs const auto& fieldRef{pl.Type().Field(field)}; const KeyValueType fieldType{fieldRef.Type()}; if (tagType == TAG_ARRAY) { + const carraytag atag = rdser.GetCArrayTag(); + const auto count = atag.Count(); if rx_unlikely (!fieldRef.IsArray()) { throwUnexpectedArrayError(fieldRef); } - const carraytag atag = rdser.GetCArrayTag(); - const auto count = atag.Count(); + validateArrayFieldRestrictions(fieldRef, count, "cjson"); const int ofs = pl.ResizeArray(field, count, true); const TagType atagType = atag.Type(); if (atagType != TAG_OBJECT) { @@ -61,6 +62,7 @@ bool CJsonDecoder::decodeCJson(Payload& pl, Serializer& rdser, WrSerializer& wrs wrser.PutVarUint(count); } else { validateNonArrayFieldRestrictions(objectScalarIndexes_, pl, fieldRef, field, isInArray(), "cjson"); + validateArrayFieldRestrictions(fieldRef, 1, "cjson"); objectScalarIndexes_.set(field); pl.Set(field, cjsonValueToVariant(tagType, rdser, fieldType), true); fieldType.EvaluateOneOf( diff --git a/cpp_src/core/cjson/cjsontools.cc b/cpp_src/core/cjson/cjsontools.cc index ecce2bdc4..f08e4d8b0 100644 --- a/cpp_src/core/cjson/cjsontools.cc +++ b/cpp_src/core/cjson/cjsontools.cc @@ -207,6 +207,11 @@ void throwScalarMultipleEncodesError(const Payload& pl, const PayloadFieldType& throw Error(errLogic, "Non-array field '%s' [%d] from '%s' can only be encoded once.", f.Name(), field, pl.Type().Name()); } +void throwUnexpectedArraySizeError(std::string_view parserName, const PayloadFieldType& f, int arraySize) { + throw Error(errParams, "%s array field '%s' for this index type must contain %d elements, but got %d", parserName, f.Name(), + f.ArrayDim(), arraySize); +} + static void dumpCjsonValue(TagType type, Serializer& cjson, std::ostream& dump) { switch (type) { case TAG_VARINT: diff --git a/cpp_src/core/cjson/cjsontools.h b/cpp_src/core/cjson/cjsontools.h index 93c618524..b0a10dec2 100644 --- a/cpp_src/core/cjson/cjsontools.h +++ b/cpp_src/core/cjson/cjsontools.h @@ -18,6 +18,7 @@ void skipCjsonTag(ctag tag, Serializer& rdser, std::array [[noreturn]] void throwUnexpectedNestedArrayError(std::string_view parserName, const PayloadFieldType& f); [[noreturn]] void throwScalarMultipleEncodesError(const Payload& pl, const PayloadFieldType& f, int field); +[[noreturn]] void throwUnexpectedArraySizeError(std::string_view parserName, const PayloadFieldType& f, int arraySize); RX_ALWAYS_INLINE void validateNonArrayFieldRestrictions(const ScalarIndexesSetT& scalarIndexes, const Payload& pl, const PayloadFieldType& f, int field, bool isInArray, std::string_view parserName) { if (!f.IsArray()) { @@ -30,6 +31,14 @@ RX_ALWAYS_INLINE void validateNonArrayFieldRestrictions(const ScalarIndexesSetT& } } +RX_ALWAYS_INLINE void validateArrayFieldRestrictions(const PayloadFieldType& f, int arraySize, std::string_view parserName) { + if (f.IsArray()) { + if rx_unlikely (arraySize && f.ArrayDim() > 0 && f.ArrayDim() != arraySize) { + throwUnexpectedArraySizeError(parserName, f, arraySize); + } + } +} + void DumpCjson(Serializer& cjson, std::ostream& dump, const ConstPayload*, const TagsMatcher* = nullptr, std::string_view tab = " "); inline void DumpCjson(Serializer&& cjson, std::ostream& dump, const ConstPayload* pl, const TagsMatcher* tm = nullptr, std::string_view tab = " ") { @@ -49,4 +58,21 @@ inline void DumpCjson(Serializer&& cjson, std::ostream& dump, const TagsMatcher* DumpCjson(cjson, dump, tm, tab); } +static inline Variant convertValueForPayload(Payload& pl, int field, Variant&& value, std::string_view source) { + if (field < 0) { + return value; + } + + auto plFieldType = pl.Type().Field(field).Type(); + if (plFieldType.IsSame(value.Type())) { + return value; + } else if ((plFieldType.IsNumeric() && value.Type().IsNumeric()) || + (plFieldType.Is() && value.Type().Is())) { + return value.convert(pl.Type().Field(field).Type()); + } else { + throw Error(errLogic, "Error parsing %s field '%s' - got %s, expected %s", source, pl.Type().Field(field).Name(), + value.Type().Name(), plFieldType.Name()); + } +} + } // namespace reindexer diff --git a/cpp_src/core/cjson/jsondecoder.cc b/cpp_src/core/cjson/jsondecoder.cc index 7696741cb..2a0e2eba8 100644 --- a/cpp_src/core/cjson/jsondecoder.cc +++ b/cpp_src/core/cjson/jsondecoder.cc @@ -52,6 +52,7 @@ void JsonDecoder::decodeJsonObject(Payload& pl, CJsonBuilder& builder, const gas (void)subelem; ++count; } + validateArrayFieldRestrictions(f, count, "json"); int pos = pl.ResizeArray(field, count, true); for (auto& subelem : elem.value) { pl.Set(field, pos++, jsonValue2Variant(subelem.value, f.Type(), f.Name())); @@ -70,6 +71,7 @@ void JsonDecoder::decodeJsonObject(Payload& pl, CJsonBuilder& builder, const gas case gason::JSON_TRUE: case gason::JSON_FALSE: { validateNonArrayFieldRestrictions(objectScalarIndexes_, pl, f, field, isInArray(), "json"); + validateArrayFieldRestrictions(f, 1, "json"); objectScalarIndexes_.set(field); Variant value = jsonValue2Variant(elem.value, f.Type(), f.Name()); builder.Ref(tagName, value, field); @@ -150,7 +152,10 @@ class TagsPathGuard { void JsonDecoder::decodeJsonObject(const gason::JsonValue& root, CJsonBuilder& builder) { for (const auto& elem : root) { - int tagName = tagsMatcher_.name2tag(elem.key, true); + const int tagName = tagsMatcher_.name2tag(elem.key, true); + if (tagName == 0) { + throw Error(errParseJson, "Unsupported JSON format. Unnamed field detected"); + } TagsPathGuard tagsPathGuard(tagsPath_, tagName); decodeJson(nullptr, builder, elem.value, tagName, true); } diff --git a/cpp_src/core/cjson/msgpackdecoder.cc b/cpp_src/core/cjson/msgpackdecoder.cc index 8b72c9f21..0c7cf525c 100644 --- a/cpp_src/core/cjson/msgpackdecoder.cc +++ b/cpp_src/core/cjson/msgpackdecoder.cc @@ -11,22 +11,20 @@ template void MsgPackDecoder::setValue(Payload& pl, CJsonBuilder& builder, const T& value, int tagName) { int field = tm_.tags2field(tagsPath_.data(), tagsPath_.size()); if (field > 0) { - validateNonArrayFieldRestrictions(objectScalarIndexes_, pl, pl.Type().Field(field), field, isInArray(), "msgpack"); + const auto& f = pl.Type().Field(field); + validateNonArrayFieldRestrictions(objectScalarIndexes_, pl, f, field, isInArray(), "msgpack"); + if (!isInArray()) { + validateArrayFieldRestrictions(f, 1, "msgpack"); + } Variant val(value); builder.Ref(tagName, val, field); - pl.Set(field, std::move(val), true); + pl.Set(field, convertValueForPayload(pl, field, std::move(val), "msgpack")); objectScalarIndexes_.set(field); } else { builder.Put(tagName, value); } } -void MsgPackDecoder::iterateOverArray(const msgpack_object* begin, const msgpack_object* end, Payload& pl, CJsonBuilder& array) { - for (const msgpack_object* p = begin; p != end; ++p) { - decode(pl, array, *p, 0); - } -} - int MsgPackDecoder::decodeKeyToTag(const msgpack_object_kv& obj) { using namespace std::string_view_literals; switch (obj.key.type) { @@ -95,11 +93,43 @@ void MsgPackDecoder::decode(Payload& pl, CJsonBuilder& builder, const msgpack_ob if rx_unlikely (!f.IsArray()) { throw Error(errLogic, "Error parsing msgpack field '%s' - got array, expected scalar %s", f.Name(), f.Type().Name()); } - auto& array = builder.ArrayRef(tagName, field, count); - iterateOverArray(begin, end, pl, array); + validateArrayFieldRestrictions(f, count, "msgpack"); + int pos = pl.ResizeArray(field, count, true); + for (const msgpack_object* p = begin; p != end; ++p) { + pl.Set(field, pos++, + convertValueForPayload( + pl, field, + [&] { + switch (p->type) { + case MSGPACK_OBJECT_BOOLEAN: + return Variant{p->via.boolean}; + case MSGPACK_OBJECT_POSITIVE_INTEGER: + return Variant{int64_t(p->via.u64)}; + case MSGPACK_OBJECT_NEGATIVE_INTEGER: + return Variant{p->via.i64}; + case MSGPACK_OBJECT_FLOAT32: + case MSGPACK_OBJECT_FLOAT64: + return Variant{p->via.f64}; + case MSGPACK_OBJECT_STR: + return Variant{p_string(reinterpret_cast(&p->via.str)), Variant::hold_t{}}; + case MSGPACK_OBJECT_NIL: + case MSGPACK_OBJECT_ARRAY: + case MSGPACK_OBJECT_MAP: + case MSGPACK_OBJECT_BIN: + case MSGPACK_OBJECT_EXT: + default: + throw Error(errParams, "Unsupported MsgPack array field type: %s(%d)", ToString(p->type), + int(p->type)); + } + }(), + "msgpack")); + } + builder.ArrayRef(tagName, field, count); } else { auto array = builder.Array(tagName, type); - iterateOverArray(begin, end, pl, array); + for (const msgpack_object* p = begin; p != end; ++p) { + decode(pl, array, *p, 0); + } } break; } diff --git a/cpp_src/core/cjson/msgpackdecoder.h b/cpp_src/core/cjson/msgpackdecoder.h index 042b05263..e89f2ff8e 100644 --- a/cpp_src/core/cjson/msgpackdecoder.h +++ b/cpp_src/core/cjson/msgpackdecoder.h @@ -19,7 +19,6 @@ class MsgPackDecoder { private: void decode(Payload& pl, CJsonBuilder& builder, const msgpack_object& obj, int tagName); - void iterateOverArray(const msgpack_object* begin, const msgpack_object* end, Payload& pl, CJsonBuilder& builder); int decodeKeyToTag(const msgpack_object_kv& obj); diff --git a/cpp_src/core/cjson/protobufdecoder.cc b/cpp_src/core/cjson/protobufdecoder.cc index 7d2dcc04a..45d43d53b 100644 --- a/cpp_src/core/cjson/protobufdecoder.cc +++ b/cpp_src/core/cjson/protobufdecoder.cc @@ -1,4 +1,5 @@ #include "protobufdecoder.h" +#include "core/cjson/cjsontools.h" #include "core/schema.h" #include "estl/protobufparser.h" @@ -51,9 +52,10 @@ void ProtobufDecoder::setValue(Payload& pl, CJsonBuilder& builder, ProtobufValue if (item.isArray) { arraysStorage_.UpdateArraySize(item.tagName, field); } else { + validateArrayFieldRestrictions(f, 1, "protobuf"); builder.Ref(item.tagName, value, field); } - pl.Set(field, std::move(value), true); + pl.Set(field, convertValueForPayload(pl, field, std::move(value), "protobuf"), true); objectScalarIndexes_.set(field); } else { if (item.isArray) { @@ -78,13 +80,14 @@ Error ProtobufDecoder::decodeArray(Payload& pl, CJsonBuilder& builder, const Pro if (packed) { int count = 0; while (!parser.IsEof()) { - pl.Set(field, parser.ReadArrayItem(item.itemType), true); + pl.Set(field, convertValueForPayload(pl, field, parser.ReadArrayItem(item.itemType), "protobuf"), true); ++count; } builder.ArrayRef(item.tagName, field, count); } else { setValue(pl, builder, item); } + validateArrayFieldRestrictions(f, reinterpret_cast(pl.Field(field).p_)->len, "protobuf"); } else { CJsonBuilder& array = arraysStorage_.GetArray(item.tagName); if (packed) { diff --git a/cpp_src/core/ft/config/ftfastconfig.cc b/cpp_src/core/ft/config/ftfastconfig.cc index 5a1ec4cb0..e900abf27 100644 --- a/cpp_src/core/ft/config/ftfastconfig.cc +++ b/cpp_src/core/ft/config/ftfastconfig.cc @@ -131,8 +131,8 @@ void FtFastConfig::parse(std::string_view json, const RHashMap const std::string splitterStr = toLower(root["splitter"].As("fast")); if (splitterStr == "fast") { splitterType = Splitter::Fast; - } else if (splitterStr == "friso") { - splitterType = Splitter::Friso; + } else if (splitterStr == "friso" || splitterStr == "mmseg_cn") { + splitterType = Splitter::MMSegCN; } else { throw Error(errParseJson, "FtFastConfig: unknown splitter value: %s", splitterStr); } @@ -185,8 +185,8 @@ std::string FtFastConfig::GetJson(const fast_hash_map& fields) case Splitter::Fast: jsonBuilder.Put("splitter", "fast"); break; - case Splitter::Friso: - jsonBuilder.Put("splitter", "friso"); + case Splitter::MMSegCN: + jsonBuilder.Put("splitter", "mmseg_cn"); break; } diff --git a/cpp_src/core/ft/config/ftfastconfig.h b/cpp_src/core/ft/config/ftfastconfig.h index 76a9a99bd..3e8fdfd59 100644 --- a/cpp_src/core/ft/config/ftfastconfig.h +++ b/cpp_src/core/ft/config/ftfastconfig.h @@ -53,7 +53,7 @@ struct FtFastConfig : public BaseFTConfig { int maxAreasInDoc = 5; int maxTotalAreasToCache = -1; - enum class Splitter { Fast, Friso } splitterType = Splitter::Fast; + enum class Splitter { Fast, MMSegCN } splitterType = Splitter::Fast; RVector fieldsCfg; enum class Optimization { CPU, Memory } optimization = Optimization::Memory; diff --git a/cpp_src/core/ft/ft_fast/dataholder.cc b/cpp_src/core/ft/ft_fast/dataholder.cc index 55ab52d97..fad988b72 100644 --- a/cpp_src/core/ft/ft_fast/dataholder.cc +++ b/cpp_src/core/ft/ft_fast/dataholder.cc @@ -132,7 +132,7 @@ DataHolder::DataHolder(FtFastConfig* c) { cfg_ = c; if (cfg_->splitterType == FtFastConfig::Splitter::Fast) { splitter_ = make_intrusive(cfg_->extraWordSymbols); - } else if (cfg_->splitterType == FtFastConfig::Splitter::Friso) { + } else if (cfg_->splitterType == FtFastConfig::Splitter::MMSegCN) { splitter_ = make_intrusive(); } else { assertrx_throw(false); diff --git a/cpp_src/core/ft/ft_fast/dataholder.h b/cpp_src/core/ft/ft_fast/dataholder.h index 360a274e5..39b93ffff 100644 --- a/cpp_src/core/ft/ft_fast/dataholder.h +++ b/cpp_src/core/ft/ft_fast/dataholder.h @@ -50,17 +50,16 @@ class PackedWordEntry { class WordEntry { public: WordEntry() noexcept = default; - WordEntry(const IdRelSet& _vids, bool _virtualWord) : vids(_vids), virtualWord(_virtualWord) {} + WordEntry(const IdRelSet& _vids) : vids_(_vids) {} WordEntry(const WordEntry&) = delete; WordEntry(WordEntry&&) noexcept = default; WordEntry& operator=(const WordEntry&) = delete; WordEntry& operator=(WordEntry&&) noexcept = default; // Explicit copy - WordEntry MakeCopy() const { return WordEntry(this->vids, this->virtualWord); } + WordEntry MakeCopy() const { return WordEntry(vids_); } - IdRelSet vids; - bool virtualWord = false; + IdRelSet vids_; }; enum ProcessStatus { FullRebuild, RecommitLast, CreateNew }; diff --git a/cpp_src/core/ft/ft_fast/dataprocessor.cc b/cpp_src/core/ft/ft_fast/dataprocessor.cc index 240bfc8df..eefe51c88 100644 --- a/cpp_src/core/ft/ft_fast/dataprocessor.cc +++ b/cpp_src/core/ft/ft_fast/dataprocessor.cc @@ -14,8 +14,6 @@ using std::chrono::milliseconds; namespace reindexer { -constexpr int kDigitUtfSizeof = 1; - template void DataProcessor::Process(bool multithread) { ExceptionPtrWrapper exwr; @@ -77,7 +75,6 @@ typename DataProcessor::WordsVector DataProcessor::insertIntoSuf auto& suffix = holder.GetSuffix(); suffix.reserve(words_um.size() * 20, words_um.size()); - const bool enableNumbersSearch = holder.cfg_->enableNumbersSearch; WordsVector found; found.reserve(words_um.size()); @@ -96,11 +93,7 @@ typename DataProcessor::WordsVector DataProcessor::insertIntoSuf words.emplace_back(); pos = holder.BuildWordId(id); - if (enableNumbersSearch && keyIt.second.virtualWord) { - suffix.insert(keyIt.first, pos, kDigitUtfSizeof); - } else { - suffix.insert(keyIt.first, pos); - } + suffix.insert(keyIt.first, pos); } return found; } @@ -127,9 +120,9 @@ size_t DataProcessor::commitIdRelSets(const WordsVector& preprocWords, w idsetcnt += sizeof(*wIt); } - word->vids.insert(word->vids.end(), std::make_move_iterator(keyIt->second.vids.begin()), - std::make_move_iterator(keyIt->second.vids.end())); - keyIt->second.vids = IdRelSet(); + word->vids.insert(word->vids.end(), std::make_move_iterator(keyIt->second.vids_.begin()), + std::make_move_iterator(keyIt->second.vids_.end())); + keyIt->second.vids_ = IdRelSet(); word->vids.shrink_to_fit(); idsetcnt += word->vids.heap_size(); } @@ -240,7 +233,7 @@ size_t DataProcessor::buildWordsMap(words_map& words_um, bool multithrea // build words map parallel in maxIndexWorkers threads auto worker = [this, &ctxs, &vdocsTexts, offset, fieldscount, &cfg, &vdocs, &textSplitter](int i) { auto ctx = &ctxs[i]; - std::vector virtualWords; + std::vector virtualWords; const size_t start = ctx->from; const size_t fin = ctx->to; const bool enableNumbersSearch = cfg->enableNumbersSearch; @@ -271,7 +264,7 @@ size_t DataProcessor::buildWordsMap(words_map& words_um, bool multithrea auto [idxIt, emplaced] = ctx->words_um.try_emplace_prehashed(whash, word); (void)emplaced; - const int mfcnt = idxIt->second.vids.Add(vdocId, insertPos, rfield); + const int mfcnt = idxIt->second.vids_.Add(vdocId, insertPos, rfield); if (mfcnt > vdoc.mostFreqWordCount[rfield]) { vdoc.mostFreqWordCount[rfield] = mfcnt; } @@ -302,19 +295,18 @@ size_t DataProcessor::buildWordsMap(words_map& words_um, bool multithrea #if defined(RX_WITH_STDLIB_DEBUG) || defined(REINDEX_WITH_ASAN) const auto fBeforeMove = it.first; const auto sBeforeMove = it.second.MakeCopy(); - const auto sCapacityBeforeMove = it.second.vids.capacity(); + const auto sCapacityBeforeMove = it.second.vids_.capacity(); #endif // defined(RX_WITH_STDLIB_DEBUG) || defined(REINDEX_WITH_ASAN) auto [idxIt, emplaced] = words_um.try_emplace(std::move(it.first), std::move(it.second)); if (!emplaced) { #if defined(RX_WITH_STDLIB_DEBUG) || defined(REINDEX_WITH_ASAN) // Make sure, that try_emplace did not moved the values assertrx(it.first == fBeforeMove); - assertrx(it.second.virtualWord == sBeforeMove.virtualWord); - assertrx(it.second.vids.size() == sBeforeMove.vids.size()); - assertrx(it.second.vids.capacity() == sCapacityBeforeMove); + assertrx(it.second.vids_.size() == sBeforeMove.vids_.size()); + assertrx(it.second.vids_.capacity() == sCapacityBeforeMove); #endif // defined(RX_WITH_STDLIB_DEBUG) || defined(REINDEX_WITH_ASAN) - auto& resultVids = idxIt->second.vids; - auto& newVids = it.second.vids; + auto& resultVids = idxIt->second.vids_; + auto& newVids = it.second.vids_; resultVids.insert(resultVids.end(), std::make_move_iterator(newVids.begin()), std::make_move_iterator(newVids.end())); } } @@ -347,8 +339,8 @@ size_t DataProcessor::buildWordsMap(words_map& words_um, bool multithrea if (holder_.cfg_->logLevel >= LogInfo) { WrSerializer out; for (auto& w : words_um) { - if (w.second.vids.size() > vdocs.size() / 5 || int64_t(w.second.vids.size()) > holder_.cfg_->mergeLimit) { - out << w.first << "(" << w.second.vids.size() << ") "; + if (w.second.vids_.size() > vdocs.size() / 5 || int64_t(w.second.vids_.size()) > holder_.cfg_->mergeLimit) { + out << w.first << "(" << w.second.vids_.size() << ") "; } } logPrintf(LogInfo, "Total documents: %d. Potential stop words (with corresponding docs count): %s", vdocs.size(), out.Slice()); @@ -359,19 +351,17 @@ size_t DataProcessor::buildWordsMap(words_map& words_um, bool multithrea template void DataProcessor::buildVirtualWord(std::string_view word, words_map& words_um, VDocIdType docType, int rfield, size_t insertPos, - std::vector& container) { + std::vector& container) { auto& vdoc(holder_.vdocs_[docType]); NumToText::convert(word, container); - for (std::string& numberWord : container) { + for (const auto numberWord : container) { WordEntry wentry; - wentry.virtualWord = true; - auto idxIt = words_um.emplace(std::move(numberWord), std::move(wentry)).first; - const int mfcnt = idxIt->second.vids.Add(docType, insertPos, rfield); + auto idxIt = words_um.emplace(numberWord, std::move(wentry)).first; + const int mfcnt = idxIt->second.vids_.Add(docType, insertPos, rfield); if (mfcnt > vdoc.mostFreqWordCount[rfield]) { vdoc.mostFreqWordCount[rfield] = mfcnt; } ++vdoc.wordsCount[rfield]; - insertPos += kDigitUtfSizeof; } } diff --git a/cpp_src/core/ft/ft_fast/dataprocessor.h b/cpp_src/core/ft/ft_fast/dataprocessor.h index 50fe679fe..936e435b3 100644 --- a/cpp_src/core/ft/ft_fast/dataprocessor.h +++ b/cpp_src/core/ft/ft_fast/dataprocessor.h @@ -86,7 +86,7 @@ class DataProcessor { [[nodiscard]] size_t buildWordsMap(words_map& m, bool multithread, intrusive_ptr textSplitter); void buildVirtualWord(std::string_view word, words_map& words_um, VDocIdType docType, int rfield, size_t insertPos, - std::vector& container); + std::vector& container); void buildTyposMap(uint32_t startPos, const WordsVector& preprocWords); [[nodiscard]] static WordsVector insertIntoSuffix(words_map& words_um, DataHolder& holder); [[nodiscard]] static size_t commitIdRelSets(const WordsVector& preprocWords, words_map& words_um, DataHolder& holder, diff --git a/cpp_src/core/ft/numtotext.cc b/cpp_src/core/ft/numtotext.cc index 44fe81df2..b7405f653 100644 --- a/cpp_src/core/ft/numtotext.cc +++ b/cpp_src/core/ft/numtotext.cc @@ -1,163 +1,100 @@ #include "numtotext.h" - -#include -#include -#include #include "tools/errors.h" namespace reindexer { constexpr std::string_view units[] = {"", "один", "два", "три", "четыре", "пять", "шесть", "семь", "восемь", "девять"}; -constexpr std::string_view unitsNominat[] = {"", "одна", "две"}; +constexpr std::string_view unitsNominat[] = {"", "одна", "две", "три", "четыре", "пять", "шесть", "семь", "восемь", "девять"}; constexpr std::string_view tens[] = {"", "одиннадцать", "двенадцать", "тринадцать", "четырнадцать", "пятнадцать", "шестнадцать", "семнадцать", "восемнадцать", "девятнадцать"}; constexpr std::string_view decades[] = {"", "десять", "двадцать", "тридцать", "сорок", "пятьдесят", "шестьдесят", "семьдесят", "восемьдесят", "девяносто"}; constexpr std::string_view hundreads[] = {"", "сто", "двести", "триста", "четыреста", "пятьсот", "шестьсот", "семьсот", "восемьсот", "девятьсот"}; -constexpr std::string_view thousands[] = {"тысяча", "тысячи", "тысяч"}; -constexpr std::string_view millions[] = {"миллион", "миллиона", "миллионов"}; -constexpr std::string_view billions[] = {"миллиард", "миллиарда", "миллиардов"}; -constexpr std::string_view trillions[] = {"триллион", "триллиона", "триллионов"}; -constexpr std::string_view quadrillion[] = {"квадриллион", "квадриллиона", "квадриллионов"}; -constexpr std::string_view quintillion[] = {"квинтиллион", "квинтиллиона", "квинтиллионов"}; -constexpr std::string_view sextillion[] = {"секстиллион", "секстиллиона", "секстиллионов"}; -constexpr std::string_view septillion[] = {"септиллион", "септиллиона", "септиллионов"}; -enum Numorders : int { Thousands, Millions, Billions, Trillions, Quadrillion, Quintillion, Sextillion, Septillion }; +// clang-format off +constexpr static std::string_view kNumOrders[][10] = { +// 0 1 2 3 4 5 6 7 8 9 + {"тысяч", "тысяча", "тысячи", "тысячи", "тысячи", "тысяч", "тысяч", "тысяч", "тысяч", "тысяч"}, + {"миллионов", "миллион", "миллиона", "миллиона", "миллиона", "миллионов", "миллионов", "миллионов", "миллионов", "миллионов"}, + {"миллиардов", "миллиард", "миллиарда", "миллиарда", "миллиарда", "миллиардов", "миллиардов", "миллиардов", "миллиардов", "миллиардов"}, + {"триллионов", "триллион", "триллиона", "триллиона", "триллиона", "триллионов", "триллионов", "триллионов", "триллионов", "триллионов"}, + {"квадриллионов", "квадриллион", "квадриллиона", "квадриллиона", "квадриллиона", "квадриллионов", "квадриллионов", "квадриллионов", "квадриллионов", "квадриллионов"}, + {"квинтиллионов", "квинтиллион", "квинтиллиона", "квинтиллиона", "квинтиллиона", "квинтиллионов", "квинтиллионов", "квинтиллионов", "квинтиллионов", "квинтиллионов"}, + {"секстиллионов", "секстиллион", "секстиллиона", "секстиллиона", "секстиллиона", "секстиллионов", "секстиллионов", "секстиллионов", "секстиллионов", "секстиллионов"}, + {"септиллионов", "септиллион", "септиллиона", "септиллиона", "септиллиона", "септиллионов", "септиллионов", "септиллионов", "септиллионов", "септиллионов"}}; +// clang-format on +RX_ALWAYS_INLINE static int ansiCharacterToDigit(char ch) noexcept { return static_cast(ch - 48); } -static std::string_view getNumorder(int numorder, int i) { - switch (numorder) { - case Thousands: - return thousands[i]; - case Millions: - return millions[i]; - case Billions: - return billions[i]; - case Trillions: - return trillions[i]; - case Quadrillion: - return quadrillion[i]; - case Quintillion: - return quintillion[i]; - case Sextillion: - return sextillion[i]; - case Septillion: - return septillion[i]; - default: - throw Error(errParams, "Incorrect order [%s]: too big", numorder); +static std::vector& formTextString(std::string_view str, std::vector& words) { + if (str.empty()) { + return words; } -} - -RX_ALWAYS_INLINE int ansiCharacterToDigit(char ch) noexcept { return static_cast(ch - 48); } - -static std::vector getOrders(std::string_view str) { - std::string numStr(str); - std::reverse(numStr.begin(), numStr.end()); - int numChars = numStr.length(); - std::vector orders; - orders.reserve(numChars / 3); - for (int i = 0; i < numChars; i += 3) { - std::string tempString; - if (i <= numChars - 3) { - tempString += numStr[i + 2]; - tempString += numStr[i + 1]; - tempString += numStr[i]; - } else { - int lostChars = numChars - i; - switch (lostChars) { - case 1: - tempString = numStr[i]; - break; - case 2: - tempString += numStr[i + 1]; - tempString += numStr[i]; - break; - default: - throw Error(errLogic, "Unexpected lost characters number: %d", lostChars); - } + unsigned int ordersMax = (str.length() - 1) / 3 + 1; + unsigned int orderDigitCount = str.length() - (ordersMax - 1) * 3; + unsigned int baseOffset = 0; + for (int k = ordersMax; k > 0; k--) { + unsigned int hundreadsIndx = 0; + unsigned int tenIndex = 0; + unsigned int numIndex = 0; + switch (orderDigitCount) { + case 1: + numIndex = ansiCharacterToDigit(str[baseOffset]); + break; + case 2: + tenIndex = ansiCharacterToDigit(str[baseOffset]); + numIndex = ansiCharacterToDigit(str[baseOffset + 1]); + break; + case 3: + hundreadsIndx = ansiCharacterToDigit(str[baseOffset]); + tenIndex = ansiCharacterToDigit(str[baseOffset + 1]); + numIndex = ansiCharacterToDigit(str[baseOffset + 2]); + break; + default: + throw Error(errLogic, "Incorrect orderDigitCount %d", orderDigitCount); + } + if (hundreadsIndx != 0) { + words.emplace_back(hundreads[hundreadsIndx]); } - orders.emplace_back(std::move(tempString)); - } - return orders; -} - -static std::vector getDecimal(const std::string& str, int i) { - std::vector words; - int v = std::stoi(str); - if (v < 10) { - words.emplace_back(units[v]); - } else if (v % 10 == 0) { - words.emplace_back(decades[v / 10]); - } else if (v < 20) { - words.emplace_back(tens[v % 10]); - } else if (v % 10 < 3 && i == 1) { - words.emplace_back(decades[ansiCharacterToDigit(str[0])]); - words.emplace_back(unitsNominat[ansiCharacterToDigit(str[1])]); - } else { - words.emplace_back(decades[ansiCharacterToDigit(str[0])]); - words.emplace_back(units[ansiCharacterToDigit(str[1])]); - } - return words; -} -static std::string getNumOrders(int i, int num) { - std::string orders; - if (i > 0) { - if (num % 10 > 4 || (num % 100 > 10 && num % 100 < 20) || num % 10 == 0) { - orders = getNumorder(i - 1, 2); - } else if (num % 10 > 1 && num % 10 < 5) { - orders = getNumorder(i - 1, 1); - } else { - orders = getNumorder(i - 1, 0); + if (tenIndex == 1 && numIndex != 0) { + words.emplace_back(tens[numIndex]); + } else if (tenIndex != 0) { + words.emplace_back(decades[tenIndex]); } - } - return orders; -} -static std::vector formTextString(const std::string& str, int i) { - std::vector words; - int strlen = str.length(); - if (strlen == 3) { - words.emplace_back(hundreads[ansiCharacterToDigit(str[0])]); - std::string decimal; - decimal += str[1]; - decimal += str[2]; - std::vector decimalWords(getDecimal(decimal, i)); - words.insert(words.end(), make_move_iterator(decimalWords.begin()), make_move_iterator(decimalWords.end())); - } else if (strlen == 2) { - words = getDecimal(str, i); - } else { - if ((i == 1) && std::stoi(str) < 3) { - words.emplace_back(unitsNominat[std::stoi(str)]); - } else { - words.emplace_back(units[std::stoi(str)]); + if (numIndex != 0 && tenIndex != 1) { + if (k == 2) { // thousands + words.emplace_back(unitsNominat[numIndex]); + } else { + words.emplace_back(units[numIndex]); + } } - } - if (i > 0) { - words.emplace_back(getNumOrders(i, std::stoi(str))); + bool isAllNull = hundreadsIndx == 0 && tenIndex == 0 && numIndex == 0; + if (k > 1 && !isAllNull) { + words.emplace_back(kNumOrders[k - 2][numIndex]); + } + baseOffset += orderDigitCount; + orderDigitCount = 3; } return words; } -std::vector& NumToText::convert(std::string_view str, std::vector& output) { +std::vector& NumToText::convert(std::string_view str, std::vector& output) { output.resize(0); - if ((str.length() == 1) && (str[0] == '0')) { - output = {"ноль"}; - return output; + unsigned int k = 0; + for (; k < str.length() && str[k] == '0'; ++k) { + output.emplace_back("ноль"); } + str = str.substr(k); // unreasonably big if (str.length() > 27) { + output.resize(0); return output; } - std::vector orders(getOrders(str)); - for (size_t i = 0; i < orders.size(); ++i) { - size_t oppositeSideIndex = orders.size() - 1 - i; - std::vector digits(formTextString(orders[oppositeSideIndex], oppositeSideIndex)); - output.insert(output.end(), make_move_iterator(digits.begin()), make_move_iterator(digits.end())); - } - return output; + + return formTextString(str, output); } } // namespace reindexer diff --git a/cpp_src/core/ft/numtotext.h b/cpp_src/core/ft/numtotext.h index db87cbd38..2c0a5b755 100644 --- a/cpp_src/core/ft/numtotext.h +++ b/cpp_src/core/ft/numtotext.h @@ -10,7 +10,7 @@ namespace reindexer { class NumToText { public: - static std::vector& convert(std::string_view numStr, std::vector& output); + static std::vector& convert(std::string_view numStr, std::vector& output); }; } // namespace reindexer diff --git a/cpp_src/core/idsetcache.h b/cpp_src/core/idsetcache.h index d70d7f266..fcc57d6a0 100644 --- a/cpp_src/core/idsetcache.h +++ b/cpp_src/core/idsetcache.h @@ -76,7 +76,11 @@ T& operator<<(T& os, const IdSetCacheVal& v) { struct equal_idset_cache_key { bool operator()(const IdSetCacheKey& lhs, const IdSetCacheKey& rhs) const noexcept { - return lhs.cond == rhs.cond && lhs.sort == rhs.sort && *lhs.keys == *rhs.keys; + try { + return lhs.cond == rhs.cond && lhs.sort == rhs.sort && *lhs.keys == *rhs.keys; + } catch (...) { + return false; // For non-comparable variant arrays (really rare case in this context) + } } }; struct hash_idset_cache_key { diff --git a/cpp_src/core/index/indexordered.cc b/cpp_src/core/index/indexordered.cc index ba30c2c73..aa824e012 100644 --- a/cpp_src/core/index/indexordered.cc +++ b/cpp_src/core/index/indexordered.cc @@ -58,7 +58,10 @@ SelectKeyResults IndexOrdered::SelectKey(const VariantArray& keys, CondType c SelectKeyResult res; auto startIt = this->idx_map.begin(); auto endIt = this->idx_map.end(); - auto key1 = *keys.begin(); + const auto& key1 = *keys.begin(); + if (key1.IsNullValue() || (keys.size() > 1 && keys[1].IsNullValue())) { + throw Error(errParams, "Can not use 'null'-value with operators '>','<','<=','>=' and 'RANGE()' (index: '%s')", this->Name()); + } switch (condition) { case CondLt: endIt = this->idx_map.lower_bound(static_cast(key1)); diff --git a/cpp_src/core/index/indexunordered.cc b/cpp_src/core/index/indexunordered.cc index 18af49bea..2e62a8621 100644 --- a/cpp_src/core/index/indexunordered.cc +++ b/cpp_src/core/index/indexunordered.cc @@ -301,13 +301,22 @@ SelectKeyResults IndexUnordered::SelectKey(const VariantArray& keys, CondType // Get set of keys or single key case CondEq: case CondSet: { + for (const auto& key : keys) { + if (key.IsNullValue()) { + throw Error(errParams, + "Can not use 'null'-value with operators '=' and 'IN()' (index: '%s'). Use 'IS NULL'/'IS NOT NULL' instead", + this->Name()); + } + } + struct { T* i_map; const VariantArray& keys; + std::string_view indexName; SortType sortId; Index::SelectOpts opts; bool isSparse; - } ctx = {&this->idx_map, keys, sortId, opts, this->opts_.IsSparse()}; + } ctx = {&this->idx_map, keys, this->Name(), sortId, opts, this->opts_.IsSparse()}; bool selectorWasSkipped = false; // should return true, if fallback to comparator required auto selector = [&ctx, &selectorWasSkipped](SelectKeyResult& res, size_t& idsCount) -> bool { @@ -357,7 +366,10 @@ SelectKeyResults IndexUnordered::SelectKey(const VariantArray& keys, CondType case CondAllSet: { // Get set of key, where all request keys are present SelectKeyResults rslts; - for (auto key : keys) { + for (const auto& key : keys) { + if (key.IsNullValue()) { + throw Error(errParams, "Can not use 'null'-value with operator 'allset' (index: '%s')", this->Name()); + } SelectKeyResult res1; auto keyIt = this->idx_map.find(static_cast(key.convert(this->KeyType()))); if (keyIt == this->idx_map.end()) { diff --git a/cpp_src/core/namespace/namespaceimpl.cc b/cpp_src/core/namespace/namespaceimpl.cc index 8805fbd4e..5585100ca 100644 --- a/cpp_src/core/namespace/namespaceimpl.cc +++ b/cpp_src/core/namespace/namespaceimpl.cc @@ -967,7 +967,7 @@ void NamespaceImpl::verifyUpdateIndex(const IndexDef& indexDef) const { FieldsSet changedFields{idxNameIt->second}; PayloadType newPlType = payloadType_; newPlType.Drop(indexDef.name_); - newPlType.Add(PayloadFieldType(newIndex->KeyType(), indexDef.name_, indexDef.jsonPaths_, indexDef.opts_.IsArray())); + newPlType.Add(PayloadFieldType(*newIndex, indexDef)); verifyConvertTypes(oldIndex->KeyType(), newIndex->KeyType(), newPlType, changedFields); } } @@ -1168,7 +1168,7 @@ bool NamespaceImpl::addIndex(const IndexDef& indexDef) { } else { PayloadType oldPlType = payloadType_; auto newIndex = Index::New(indexDef, PayloadType(), FieldsSet(), config_.cacheConfig); - payloadType_.Add(PayloadFieldType{newIndex->KeyType(), indexName, jsonPaths, newIndex->Opts().IsArray()}); + payloadType_.Add(PayloadFieldType(*newIndex, indexDef)); rollbacker.SetOldPayloadType(std::move(oldPlType)); tagsMatcher_.UpdatePayloadType(payloadType_); rollbacker.NeedResetPayloadTypeInTagsMatcher(); @@ -2188,7 +2188,7 @@ void NamespaceImpl::doModifyItem(Item& item, ItemModifyMode mode, const NsContex } for (int field = 1, regularIndexes = indexes_.firstCompositePos(); field < regularIndexes; ++field) { - Index& index = *indexes_[field]; + const Index& index = *indexes_[field]; if (index.Opts().GetCollateMode() == CollateUTF8 && index.KeyType().Is()) { if (index.Opts().IsSparse()) { assertrx(index.Fields().getTagsPathsLength() > 0); diff --git a/cpp_src/core/namespacedef.h b/cpp_src/core/namespacedef.h index 05f12ecd5..6450b262f 100644 --- a/cpp_src/core/namespacedef.h +++ b/cpp_src/core/namespacedef.h @@ -14,7 +14,7 @@ class WrSerializer; struct NamespaceDef { NamespaceDef() = default; - NamespaceDef(const std::string& iname, StorageOpts istorage = StorageOpts().Enabled().CreateIfMissing()) + explicit NamespaceDef(const std::string& iname, StorageOpts istorage = StorageOpts().Enabled().CreateIfMissing()) : name(iname), storage(istorage) {} NamespaceDef& AddIndex(const std::string& iname, const std::string& indexType, const std::string& fieldType, diff --git a/cpp_src/core/nsselecter/comparator/comparator_not_indexed.cc b/cpp_src/core/nsselecter/comparator/comparator_not_indexed.cc index 1f9350ed1..2ed129f86 100644 --- a/cpp_src/core/nsselecter/comparator/comparator_not_indexed.cc +++ b/cpp_src/core/nsselecter/comparator/comparator_not_indexed.cc @@ -25,6 +25,13 @@ ComparatorNotIndexedImplBase::ComparatorNotIndexedImplBase(const VariantAr ComparatorNotIndexedImplBase::ComparatorNotIndexedImplBase(const VariantArray& values) : value1_{GetValue(CondRange, values, 0)}, value2_{GetValue(CondRange, values, 1)} {} +ComparatorNotIndexedImplBase::ComparatorNotIndexedImplBase(const VariantArray& values) : values_{values.size()} { + for (const Variant& v : values) { + throwOnNull(v, CondSet); + values_.insert(v); + } +} + ComparatorNotIndexedImplBase::ComparatorNotIndexedImplBase(const VariantArray& values) : value_{GetValue(CondLike, values, 0)}, valueView_{p_string{value_}} {} @@ -35,6 +42,18 @@ ComparatorNotIndexedImpl::ComparatorNotIndexedImpl(const Var point_{GetValue(CondDWithin, values, 0)}, distance_{GetValue(CondDWithin, values, 1)} {} +reindexer::comparators::ComparatorNotIndexedImpl::ComparatorNotIndexedImpl(const VariantArray& values, + const PayloadType& payloadType, + const TagsPath& fieldPath) + : payloadType_{payloadType}, fieldPath_{fieldPath}, values_{values.size()} { + int i = 0; + for (const Variant& v : values) { + throwOnNull(v, CondAllSet); + values_.emplace(v, i); + ++i; + } +} + template [[nodiscard]] std::string ComparatorNotIndexedImplBase::ConditionStr() const { return fmt::sprintf("%s %s", CondToStr(), value_.As()); diff --git a/cpp_src/core/nsselecter/comparator/comparator_not_indexed.h b/cpp_src/core/nsselecter/comparator/comparator_not_indexed.h index e74ee8843..ad755d104 100644 --- a/cpp_src/core/nsselecter/comparator/comparator_not_indexed.h +++ b/cpp_src/core/nsselecter/comparator/comparator_not_indexed.h @@ -66,11 +66,7 @@ class ComparatorNotIndexedImplBase { template <> class ComparatorNotIndexedImplBase { protected: - ComparatorNotIndexedImplBase(const VariantArray& values) : values_{values.size()} { - for (const Variant& v : values) { - values_.insert(v); - } - } + ComparatorNotIndexedImplBase(const VariantArray& values); ComparatorNotIndexedImplBase(const ComparatorNotIndexedImplBase&) = default; ComparatorNotIndexedImplBase& operator=(const ComparatorNotIndexedImplBase&) = delete; ComparatorNotIndexedImplBase(ComparatorNotIndexedImplBase&&) = default; @@ -364,14 +360,7 @@ class ComparatorNotIndexedImpl : private ComparatorNotIndexed template <> class ComparatorNotIndexedImpl { public: - ComparatorNotIndexedImpl(const VariantArray& values, const PayloadType& payloadType, const TagsPath& fieldPath) - : payloadType_{payloadType}, fieldPath_{fieldPath}, values_{values.size()} { - int i = 0; - for (const Variant& v : values) { - values_.emplace(v, i); - ++i; - } - } + ComparatorNotIndexedImpl(const VariantArray& values, const PayloadType& payloadType, const TagsPath& fieldPath); ComparatorNotIndexedImpl(const ComparatorNotIndexedImpl&) = default; ComparatorNotIndexedImpl& operator=(const ComparatorNotIndexedImpl&) = delete; ComparatorNotIndexedImpl(ComparatorNotIndexedImpl&&) = default; diff --git a/cpp_src/core/nsselecter/comparator/helpers.h b/cpp_src/core/nsselecter/comparator/helpers.h index f5a2d947c..7b51fc5df 100644 --- a/cpp_src/core/nsselecter/comparator/helpers.h +++ b/cpp_src/core/nsselecter/comparator/helpers.h @@ -74,13 +74,20 @@ template } } +inline static void throwOnNull(const Variant& v, CondType cond) { + if (v.IsNullValue()) { + throw Error{errParams, "Can not use 'null'-value directly with '%s' condition in comparator", CondTypeToStr(cond)}; + } +} + template [[nodiscard]] T GetValue(CondType cond, const VariantArray& values, size_t i) { if (values.size() <= i) { throw Error{errQueryExec, "Too many arguments for condition %s", CondTypeToStr(cond)}; - } else { - return GetValue(values[i]); } + const auto& val = values[i]; + throwOnNull(val, cond); + return GetValue(values[i]); } } // namespace comparators diff --git a/cpp_src/core/payload/payloadfieldtype.cc b/cpp_src/core/payload/payloadfieldtype.cc index a66ce588f..2b5f6ac28 100644 --- a/cpp_src/core/payload/payloadfieldtype.cc +++ b/cpp_src/core/payload/payloadfieldtype.cc @@ -1,5 +1,6 @@ #include "payloadfieldtype.h" #include +#include "core/index/index.h" #include "core/keyvalue/p_string.h" #include "core/keyvalue/uuid.h" #include "estl/one_of.h" @@ -7,6 +8,14 @@ namespace reindexer { +PayloadFieldType::PayloadFieldType(const Index& index, const IndexDef& indexDef) noexcept + : type_(index.KeyType()), + name_(indexDef.name_), + jsonPaths_(indexDef.jsonPaths_), + offset_(0), + isArray_(index.Opts().IsArray()), + arrayDim_(indexDef.Type() == IndexType::IndexRTree ? 2 : -1) {} + size_t PayloadFieldType::Sizeof() const noexcept { if (IsArray()) { return sizeof(PayloadFieldValue::Array); diff --git a/cpp_src/core/payload/payloadfieldtype.h b/cpp_src/core/payload/payloadfieldtype.h index 438b3563b..5f6af0b06 100644 --- a/cpp_src/core/payload/payloadfieldtype.h +++ b/cpp_src/core/payload/payloadfieldtype.h @@ -5,16 +5,20 @@ namespace reindexer { +class Index; +struct IndexDef; // Type of field class PayloadFieldType { public: + explicit PayloadFieldType(const Index&, const IndexDef&) noexcept; PayloadFieldType(KeyValueType t, std::string n, std::vector j, bool a) noexcept - : type_(t), name_(std::move(n)), jsonPaths_(std::move(j)), offset_(0), isArray_(a) {} + : type_(t), name_(std::move(n)), jsonPaths_(std::move(j)), offset_(0), isArray_(a), arrayDim_(-1) {} size_t Sizeof() const noexcept; size_t ElemSizeof() const noexcept; size_t Alignof() const noexcept; bool IsArray() const noexcept { return isArray_; } + int8_t ArrayDim() const noexcept { return arrayDim_; } void SetArray() noexcept { isArray_ = true; } void SetOffset(size_t o) noexcept { offset_ = o; } size_t Offset() const noexcept { return offset_; } @@ -32,6 +36,7 @@ class PayloadFieldType { std::vector jsonPaths_; size_t offset_; bool isArray_; + int8_t arrayDim_; }; } // namespace reindexer diff --git a/cpp_src/core/payload/payloadtype.cc b/cpp_src/core/payload/payloadtype.cc index 0f9bfd33d..2cc553053 100644 --- a/cpp_src/core/payload/payloadtype.cc +++ b/cpp_src/core/payload/payloadtype.cc @@ -193,9 +193,6 @@ void PayloadTypeImpl::deserialize(Serializer& ser) { PayloadFieldType ft(t, name, jsonPaths, isArray); - if (isArray) { - ft.SetArray(); - } ft.SetOffset(offset); fieldsByName_.emplace(name, fields_.size()); if (t.Is()) { diff --git a/cpp_src/core/query/query.cc b/cpp_src/core/query/query.cc index e57e1d95d..ce4cf0cab 100644 --- a/cpp_src/core/query/query.cc +++ b/cpp_src/core/query/query.cc @@ -45,7 +45,7 @@ void Query::checkSubQuery() const { void Query::checkSubQueryNoData() const { if rx_unlikely (!aggregations_.empty()) { - throw Error{errQueryExec, "Aggregaton cannot be in subquery with condition Any or Empty"}; + throw Error{errQueryExec, "Aggregation cannot be in subquery with condition Any or Empty"}; } if rx_unlikely (HasLimit() && Limit() != 0) { throw Error{errQueryExec, "Limit cannot be in subquery with condition Any or Empty"}; @@ -175,7 +175,7 @@ void Query::checkSetObjectValue(const Variant& value) const { } } -VariantArray Query::deserializeValues(Serializer& ser, CondType cond) { +VariantArray Query::deserializeValues(Serializer& ser, CondType cond) const { VariantArray values; auto cnt = ser.GetVarUint(); if (cond == CondDWithin) { @@ -198,6 +198,10 @@ VariantArray Query::deserializeValues(Serializer& ser, CondType cond) { return values; } +void Query::deserializeJoinOn(Serializer&) { + throw Error(errLogic, "Unexpected call. JoinOn actual only for JoinQuery"); +} + void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { bool end = false; std::vector> equalPositions; @@ -252,7 +256,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { aggregations_.emplace_back(type, std::move(fields)); auto& ae = aggregations_.back(); while (!ser.Eof() && !aggEnd) { - int atype = ser.GetVarUint(); + auto atype = ser.GetVarUint(); switch (atype) { case QueryAggregationSort: { auto fieldName = ser.GetVString(); @@ -287,7 +291,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { if (sortingEntry.expression.length()) { sortingEntries_.push_back(std::move(sortingEntry)); } - int cnt = ser.GetVarUint(); + auto cnt = ser.GetVarUint(); if (cnt != 0 && sortingEntries_.size() != 1) { throw Error(errParams, "Forced sort order is allowed for the first sorting entry only"); } @@ -298,12 +302,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { break; } case QueryJoinOn: { - const OpType op = static_cast(ser.GetVarUint()); - const CondType condition = static_cast(ser.GetVarUint()); - std::string leftFieldName{ser.GetVString()}; - std::string rightFieldName{ser.GetVString()}; - reinterpret_cast(this)->joinEntries_.emplace_back(op, condition, std::move(leftFieldName), - std::move(rightFieldName)); + deserializeJoinOn(ser); break; } case QueryDebugLevel: @@ -350,7 +349,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { VariantArray val; std::string field(ser.GetVString()); bool isArray = ser.GetVarUint(); - int numValues = ser.GetVarUint(); + auto numValues = ser.GetVarUint(); bool hasExpressions = false; while (numValues--) { hasExpressions = ser.GetVarUint(); @@ -362,7 +361,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { case QueryUpdateField: { VariantArray val; std::string field(ser.GetVString()); - int numValues = ser.GetVarUint(); + auto numValues = ser.GetVarUint(); bool isArray = numValues > 1; bool hasExpressions = false; while (numValues--) { @@ -376,7 +375,7 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { VariantArray val; std::string field(ser.GetVString()); bool hasExpressions = false; - int numValues = ser.GetVarUint(); + auto numValues = ser.GetVarUint(); val.MarkArray(ser.GetVarUint() == 1); while (numValues--) { hasExpressions = ser.GetVarUint(); @@ -428,7 +427,10 @@ void Query::deserialize(Serializer& ser, bool& hasJoinConditions) { entries_.Get(eqPos.first - 1).equalPositions.emplace_back(std::move(eqPos.second)); } } - return; +} + +void Query::serializeJoinEntries(WrSerializer&) const { + throw Error(errLogic, "Unexpected call. JoinEntries actual only for JoinQuery"); } void Query::Serialize(WrSerializer& ser, uint8_t mode) const { @@ -473,13 +475,7 @@ void Query::Serialize(WrSerializer& ser, uint8_t mode) const { } if (mode & WithJoinEntries) { - for (const auto& qje : reinterpret_cast(this)->joinEntries_) { - ser.PutVarUint(QueryJoinOn); - ser.PutVarUint(qje.Operation()); - ser.PutVarUint(qje.Condition()); - ser.PutVString(qje.LeftFieldName()); - ser.PutVString(qje.RightFieldName()); - } + serializeJoinEntries(ser); } for (const auto& equalPoses : entries_.equalPositions) { @@ -747,4 +743,22 @@ bool Query::IsWALQuery() const noexcept { return false; } +void JoinedQuery::deserializeJoinOn(Serializer& ser) { + const OpType op = static_cast(ser.GetVarUint()); + const CondType condition = static_cast(ser.GetVarUint()); + std::string leftFieldName{ser.GetVString()}; + std::string rightFieldName{ser.GetVString()}; + joinEntries_.emplace_back(op, condition, std::move(leftFieldName), std::move(rightFieldName)); +} + +void JoinedQuery::serializeJoinEntries(WrSerializer& ser) const { + for (const auto& qje : joinEntries_) { + ser.PutVarUint(QueryJoinOn); + ser.PutVarUint(qje.Operation()); + ser.PutVarUint(qje.Condition()); + ser.PutVString(qje.LeftFieldName()); + ser.PutVString(qje.RightFieldName()); + } +} + } // namespace reindexer diff --git a/cpp_src/core/query/query.h b/cpp_src/core/query/query.h index 7e8f9bbd6..77945cbd4 100644 --- a/cpp_src/core/query/query.h +++ b/cpp_src/core/query/query.h @@ -34,6 +34,12 @@ class Query { : namespace_(std::forward(nsName)), start_(start), count_(count), calcTotal_(calcTotal) {} Query() = default; + virtual ~Query() = default; + + Query(Query&& other) noexcept = default; + Query& operator=(Query&& other) noexcept = default; + Query(const Query& other) = default; + Query& operator=(const Query& other) = delete; /// Allows to compare 2 Query objects. [[nodiscard]] bool operator==(const Query&) const; @@ -970,8 +976,10 @@ class Query { using OnHelperR = OnHelperTempl; void checkSetObjectValue(const Variant& value) const; + virtual void deserializeJoinOn(Serializer& ser); void deserialize(Serializer& ser, bool& hasJoinConditions); - VariantArray deserializeValues(Serializer&, CondType); + VariantArray deserializeValues(Serializer&, CondType) const; + virtual void serializeJoinEntries(WrSerializer& ser) const; void checkSubQueryNoData() const; void checkSubQueryWithData() const; void checkSubQuery() const; @@ -996,7 +1004,7 @@ class Query { OpType nextOp_ = OpAnd; /// Next operation constant. }; -class JoinedQuery : public Query { +class JoinedQuery final : public Query { public: JoinedQuery(JoinType jt, const Query& q) : Query(q), joinType{jt} {} JoinedQuery(JoinType jt, Query&& q) : Query(std::move(q)), joinType{jt} {} @@ -1005,6 +1013,10 @@ class JoinedQuery : public Query { JoinType joinType{JoinType::LeftJoin}; /// Default join type. h_vector joinEntries_; /// Condition for join. Filled in each subqueries, empty in root query + +private: + void deserializeJoinOn(Serializer& ser); + void serializeJoinEntries(WrSerializer& ser) const; }; template diff --git a/cpp_src/estl/suffix_map.h b/cpp_src/estl/suffix_map.h index d303b6b4e..c70513083 100644 --- a/cpp_src/estl/suffix_map.h +++ b/cpp_src/estl/suffix_map.h @@ -35,7 +35,7 @@ class suffix_map { return value_type(std::make_pair(p, m_->mapped_[m_->sa_[idx_]])); } - const value_type operator->() const { + value_type operator->() const { auto* p = &m_->text_[m_->sa_[idx_]]; return value_type(std::make_pair(p, m_->mapped_[m_->sa_[idx_]])); } @@ -136,25 +136,21 @@ class suffix_map { return end(); } - int insert(std::string_view word, const V& val, int virtual_len = -1) { - if (virtual_len == -1) { - virtual_len = word.length(); - } + int insert(std::string_view word, const V& val) { int wpos = text_.size(); size_t real_len = word.length(); text_.insert(text_.end(), word.begin(), word.end()); text_.emplace_back('\0'); mapped_.insert(mapped_.end(), real_len + 1, val); words_.emplace_back(wpos); - words_len_.emplace_back(real_len, virtual_len); + words_len_.emplace_back(real_len); built_ = false; return wpos; } const CharT* word_at(int idx) const noexcept { return &text_[words_[idx]]; } - int16_t word_len_at(int idx) const noexcept { return words_len_[idx].first; } - int16_t virtual_word_len(int idx) const noexcept { return words_len_[idx].second; } + int16_t word_len_at(int idx) const noexcept { return words_len_[idx]; } void build() { if (built_) { @@ -220,7 +216,7 @@ class suffix_map { std::vector sa_, words_; std::vector lcp_; - std::vector> words_len_; + std::vector words_len_; std::vector mapped_; std::vector text_; bool built_ = false; diff --git a/cpp_src/gtests/tests/API/base_tests.cc b/cpp_src/gtests/tests/API/base_tests.cc index 7ea486928..1582ee68a 100644 --- a/cpp_src/gtests/tests/API/base_tests.cc +++ b/cpp_src/gtests/tests/API/base_tests.cc @@ -2224,3 +2224,52 @@ TEST_F(ReindexerApi, QueryResultsLSNTest) { ASSERT_EQ(lsn, lsns[i]) << i; } } + +TEST_F(ReindexerApi, SelectNull) { + rt.OpenNamespace(default_namespace, StorageOpts().Enabled(false)); + rt.AddIndex(default_namespace, {"id", "hash", "int", IndexOpts().PK()}); + rt.AddIndex(default_namespace, {"value", "tree", "string", IndexOpts()}); + rt.AddIndex(default_namespace, {"store", "-", "string", IndexOpts()}); + rt.AddIndex(default_namespace, {"store_num", "-", "string", IndexOpts().Sparse()}); + rt.UpsertJSON(default_namespace, R"_({"id":1234, "value" : "value", "store": "store", "store_num": 10, "not_indexed": null})_"); + + for (unsigned i = 0; i < 3; ++i) { + QueryResults qr; + auto err = rt.reindexer->Select(Query(default_namespace).Not().Where("id", CondEq, Variant()), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.code(), errParams) << err.what(); + EXPECT_EQ(err.what(), "Can not use 'null'-value with operators '=' and 'IN()' (index: 'id'). Use 'IS NULL'/'IS NOT NULL' instead"); + + qr.Clear(); + err = rt.reindexer->Select(Query(default_namespace).Where("id", CondSet, VariantArray{Variant(1234), Variant()}), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.code(), errParams) << err.what(); + EXPECT_EQ(err.what(), "Can not use 'null'-value with operators '=' and 'IN()' (index: 'id'). Use 'IS NULL'/'IS NOT NULL' instead"); + + qr.Clear(); + err = rt.reindexer->Select(Query(default_namespace).Where("value", CondLt, Variant()), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.code(), errParams) << err.what(); + EXPECT_EQ(err.what(), "Can not use 'null'-value with operators '>','<','<=','>=' and 'RANGE()' (index: 'value')"); + + qr.Clear(); + err = rt.reindexer->Select(Query(default_namespace).Where("store", CondEq, Variant()), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.code(), errParams) << err.what(); + EXPECT_EQ(err.what(), "Can not use 'null'-value directly with 'CondEq' condition in comparator"); + + qr.Clear(); + err = rt.reindexer->Select(Query(default_namespace).Where("store_num", CondSet, Variant()), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.code(), errParams) << err.what(); + EXPECT_EQ(err.what(), "Can not use 'null'-value directly with 'CondEq' condition in comparator"); + + qr.Clear(); + err = rt.reindexer->Select(Query(default_namespace).Where("not_indexed", CondSet, VariantArray{Variant(1234), Variant()}), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.code(), errParams) << err.what(); + EXPECT_EQ(err.what(), "Can not use 'null'-value directly with 'CondSet' condition in comparator"); + + AwaitIndexOptimization(default_namespace); + } +} diff --git a/cpp_src/gtests/tests/unit/ft/ft_generic.cc b/cpp_src/gtests/tests/unit/ft/ft_generic.cc index 6672b748c..069208e6b 100644 --- a/cpp_src/gtests/tests/unit/ft/ft_generic.cc +++ b/cpp_src/gtests/tests/unit/ft/ft_generic.cc @@ -835,9 +835,36 @@ TEST_P(FTGenericApi, SelectWithSeveralGroup) { TEST_P(FTGenericApi, NumberToWordsSelect) { Init(GetDefaultConfig()); - Add("оценка 5 майкл джордан 23"sv, ""sv); - - CheckAllPermutations("", {"пять", "+двадцать", "+три"}, "", {{"оценка !5! майкл джордан !23!", ""}}); + auto row1 = Add("оценка 52 майкл джордан 23 пятьдесят"sv); + auto row2 = Add("8"sv); + auto row3 = Add("41 цифра и еще цифра 241"sv); + auto row4 = Add("начало 120 цифра и еще цифра 9120 конец"sv); + auto row5 = Add("слово один пять два 5 семь 7 ещё пять слово"sv); + auto row6 = Add("слово один 5 два пять семь 7 ещё 5 слово"sv); + auto row7 = Add("1000000000000 1000000000 50000000055 1000000"sv); + auto row8 = Add("70 1 7 77 377 70 7"sv); + + auto select = [this](int id, const std::string& ftQuery, const std::string& result) { + auto q{reindexer::Query("nm1").Where("ft3", CondEq, std::string(ftQuery)).And().Where("id", CondEq, id).WithRank()}; + reindexer::QueryResults res; + q.AddFunction("ft3 = highlight(!,!)"); + auto err = rt.reindexer->Select(q, res); + EXPECT_TRUE(err.ok()) << err.what(); + ASSERT_EQ(res.Count(), 1); + auto item = res.begin().GetItem(); + std::string val = item["ft1"].As(); + ASSERT_EQ(val, result); + }; + select(row1.second, "52 +двадцать +три", "оценка !52! майкл джордан !23! пятьдесят"); + select(row2.second, "восемь", "!8!"); + select(row3.second, "сорок", "!41! цифра и еще цифра !241!"); + select(row3.second, "один", "!41! цифра и еще цифра !241!"); + select(row4.second, "сто конец", "начало !120! цифра и еще цифра !9120 конец!"); + select(row4.second, "тысяч", "начало 120 цифра и еще цифра !9120! конец"); + select(row5.second, "пять", "слово один !пять! два !5! семь 7 ещё !пять! слово"); + select(row6.second, "пять", "слово один !5! два !пять! семь 7 ещё !5! слово"); + select(row7.second, "миллиардов", "1000000000000 !1000000000 50000000055! 1000000"); + select(row8.second, "\"=семьдесят =семь\"", "70 1 7 !77 377 70 7!"); } // Make sure FT seeks by a huge number set by string in DSL @@ -853,6 +880,9 @@ TEST_P(FTGenericApi, HugeNumberToWordsSelect) { "+четыреста +сорок"); // Make sure it found this only string ASSERT_TRUE(qr.Count() == 1); + auto item = qr.begin().GetItem(); + std::string json = item["ft1"].As(); + ASSERT_EQ(json, "много !7343121521906522180408440! денег"); } // Make sure way too huge numbers are ignored in FT @@ -1893,7 +1923,7 @@ TEST_P(FTGenericApi, FrisoTest) { TEST_P(FTGenericApi, FrisoTestSelect) { reindexer::FtFastConfig cfg = GetDefaultConfig(); cfg.stopWords = {}; - cfg.splitterType = reindexer::FtFastConfig::Splitter::Friso; + cfg.splitterType = reindexer::FtFastConfig::Splitter::MMSegCN; Init(cfg); std::unordered_map> index; @@ -1948,7 +1978,7 @@ TEST_P(FTGenericApi, FrisoTestSelect) { TEST_P(FTGenericApi, FrisoTextPostprocess) { reindexer::FtFastConfig cfg = GetDefaultConfig(); - cfg.splitterType = reindexer::FtFastConfig::Splitter::Friso; + cfg.splitterType = reindexer::FtFastConfig::Splitter::MMSegCN; cfg.stopWords = {}; cfg.maxAreasInDoc = 10; Init(cfg); diff --git a/cpp_src/gtests/tests/unit/rpcclient_test.cc b/cpp_src/gtests/tests/unit/rpcclient_test.cc index 31d8b765f..a8c5b767e 100644 --- a/cpp_src/gtests/tests/unit/rpcclient_test.cc +++ b/cpp_src/gtests/tests/unit/rpcclient_test.cc @@ -1384,8 +1384,20 @@ TEST_F(RPCClientTestApi, QuerySetObjectUpdate) { insertFn(kNsName, kNsSize); + client::CoroQueryResults qr; + { + err = rx.Update(Query(kNsName).Where("id", CondGe, "0").SetObject("nested", Variant(std::string(R"([{"field": 1240}])"))), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.what(), "Error modifying field value: 'Unsupported JSON format. Unnamed field detected'"); + } + + { + err = rx.Update(Query(kNsName).Where("id", CondGe, "0").SetObject("nested", Variant(std::string(R"({{"field": 1240}})"))), qr); + ASSERT_FALSE(err.ok()); + EXPECT_EQ(err.what(), "Error modifying field value: 'JSONDecoder: Error parsing json: unquoted key, pos 15'"); + } + { - client::CoroQueryResults qr; // R"(UPDATE TestQuerySetObjectUpdate SET nested = {"field": 1240} where id >= 0)" auto query = Query(kNsName).Where("id", CondGe, "0") .SetObject("nested", Variant(std::string(R"({"field": 1240})"))); diff --git a/cpp_src/gtests/tests/unit/string_function_test.cc b/cpp_src/gtests/tests/unit/string_function_test.cc index 62ad2cea4..ed60eb25a 100644 --- a/cpp_src/gtests/tests/unit/string_function_test.cc +++ b/cpp_src/gtests/tests/unit/string_function_test.cc @@ -7,6 +7,7 @@ #include #endif // REINDEX_WITH_ASAN +#include "core/ft/numtotext.h" #include "gtest/gtest.h" #include "reindexer_api.h" #include "tools/customlocal.h" @@ -170,3 +171,63 @@ TEST_F(ReindexerApi, LikeWithFullTextIndex) { err = rt.reindexer->Select(Query(default_namespace).Where("name", CondLike, "%" + content[rand() % content.size()]), qr); ASSERT_TRUE(!err.ok()); } + +TEST_F(ReindexerApi, NumToText) { + auto out = [](const std::vector& resNum) { + std::stringstream s; + for (auto& v : resNum) { + s << "[" << v << "] "; + } + s << std::endl; + return s.str(); + }; + std::vector resNum; + bool r = reindexer::NumToText::convert("0", resNum) == std::vector{"ноль"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("00", resNum) == std::vector{"ноль", "ноль"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("000010", resNum) == std::vector{"ноль", "ноль", "ноль", "ноль", "десять"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("01000000", resNum) == std::vector{"ноль", "один", "миллион"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("121", resNum) == std::vector{"сто", "двадцать", "один"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("1", resNum) == std::vector{"один"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("9", resNum) == std::vector{"девять"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("10", resNum) == std::vector{"десять"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("13", resNum) == std::vector{"тринадцать"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("30", resNum) == std::vector{"тридцать"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("48", resNum) == std::vector{"сорок", "восемь"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("100", resNum) == std::vector{"сто"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("500", resNum) == std::vector{"пятьсот"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("999", resNum) == std::vector{"девятьсот", "девяносто", "девять"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("1000", resNum) == std::vector{"одна", "тысяча"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("1001", resNum) == std::vector{"одна", "тысяча", "один"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("5111", resNum) == std::vector{"пять", "тысяч", "сто", "одиннадцать"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("777101", resNum) == + std::vector{"семьсот", "семьдесят", "семь", "тысяч", "сто", "один"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("1000000000", resNum) == std::vector{"один", "миллиард"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("1005000000", resNum) == std::vector{"один", "миллиард", "пять", "миллионов"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("50000000055", resNum) == + std::vector{"пятьдесят", "миллиардов", "пятьдесят", "пять"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("100000000000000000000000000", resNum) == std::vector{"сто", "септиллионов"}; + ASSERT_TRUE(r) << out(resNum); + r = reindexer::NumToText::convert("1000000000000000000000000000", resNum) == std::vector{}; + ASSERT_TRUE(r) << out(resNum); +} diff --git a/cpp_src/replicator/replicator.cc b/cpp_src/replicator/replicator.cc index e7f478b1e..905cde0bb 100644 --- a/cpp_src/replicator/replicator.cc +++ b/cpp_src/replicator/replicator.cc @@ -609,7 +609,7 @@ Error Replicator::syncNamespaceForced(const NamespaceDef& ns, std::string_view r err = syncMetaForced(tmpNs, ns.name); if (err.ok()) { - err = syncSchemaForced(tmpNs, ns.name); + err = syncSchemaForced(tmpNs, NamespaceDef(ns.name)); } // Make query to complete master's namespace data diff --git a/cpp_src/server/CMakeLists.txt b/cpp_src/server/CMakeLists.txt index d297bcc5b..d7ba57e3e 100644 --- a/cpp_src/server/CMakeLists.txt +++ b/cpp_src/server/CMakeLists.txt @@ -4,7 +4,7 @@ cmake_minimum_required(VERSION 3.10) project(reindexer_server_library) set (SWAGGER_VERSION "2.x") -set (GH_FACE_VERSION "3.30.0") +set (GH_FACE_VERSION "3.31.0") set (GH_FACE_TAG "v${GH_FACE_VERSION}") set (TARGET reindexer_server_library) set (SERVER_LIB_DIR ${PROJECT_BINARY_DIR} PARENT_SCOPE) diff --git a/cpp_src/server/contrib/server.md b/cpp_src/server/contrib/server.md index 28d7951b8..a89768979 100644 --- a/cpp_src/server/contrib/server.md +++ b/cpp_src/server/contrib/server.md @@ -2364,7 +2364,7 @@ Fulltext Index configuration |**partial_match_decrease**
*optional*|Decrease of relevancy in case of partial match by value: partial_match_decrease * (non matched symbols) / (matched symbols)
**Minimum value** : `0`
**Maximum value** : `100`|integer| |**position_boost**
*optional*|Boost of search query term position
**Default** : `1.0`
**Minimum value** : `0`
**Maximum value** : `10`|number (float)| |**position_weight**
*optional*|Weight of search query term position in final rank. 0: term position will not change final rank. 1: term position will affect to final rank in 0 - 100% range
**Default** : `0.1`
**Minimum value** : `0`
**Maximum value** : `1`|number (float)| -|**splitter**
*optional*|Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'friso' - algorithm based on mmseg for Chinese and English
**Default** : `"fast"`|enum (fast, friso)| +|**splitter**
*optional*|Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'mmseg_cn' - algorithm based on friso implementation of mmseg for Chinese and English"
**Default** : `"fast"`|enum (fast, mmseg_cn)| |**stemmers**
*optional*|List of stemmers to use|< string > array| |**stop_words**
*optional*|List of objects of stop words. Words from this list will be ignored when building indexes|< [FtStopWordObject](#ftstopwordobject) > array| |**sum_ranks_by_fields_ratio**
*optional*|Ratio to summation of ranks of match one term in several fields. For example, if value of this ratio is K, request is '@+f1,+f2,+f3 word', ranks of match in fields are R1, R2, R3 and R2 < R1 < R3, final rank will be R = R2 + K*R1 + K*K*R3
**Default** : `0.0`
**Minimum value** : `0`
**Maximum value** : `1`|number (float)| diff --git a/cpp_src/server/contrib/server.yml b/cpp_src/server/contrib/server.yml index 7e04c6f7a..a2e82cf6b 100644 --- a/cpp_src/server/contrib/server.yml +++ b/cpp_src/server/contrib/server.yml @@ -3124,10 +3124,10 @@ definitions: maximum: 500 splitter: type: string - description: "Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'friso' - algorithm based on mmseg for Chinese and English" + description: "Text tokenization algorithm. 'fast' - splits text by spaces, special characters and unsupported UTF-8 symbols. Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. 'mmseg_cn' - algorithm based on friso implementation of mmseg for Chinese and English" enum: - "fast" - - "friso" + - "mmseg_cn" default: "fast" FulltextFieldConfig: diff --git a/ftfastconfig.go b/ftfastconfig.go index f2e3ab9be..e97a0f800 100644 --- a/ftfastconfig.go +++ b/ftfastconfig.go @@ -191,9 +191,9 @@ type FtFastConfig struct { // Config for document ranking Bm25Config *Bm25ConfigType `json:"bm25_config,omitempty"` // Text tokenization algorithm. Default 'fast'. - // 'fast' : splits text by spaces, special characters and unsupported UTF-8 symbols. - // Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. - // 'friso': algorithm based on mmseg for Chinese and English + // 'fast' : splits text by spaces, special characters and unsupported UTF-8 symbols. + // Each token is a combination of letters from supported UTF-8 subset, numbers and extra word symbols. + // 'mmseg_cn': algorithm based on friso mmseg for Chinese and English SplitterType string `json:"splitter,omitempty"` } diff --git a/fulltext.md b/fulltext.md index c5258e823..d8065fd97 100644 --- a/fulltext.md +++ b/fulltext.md @@ -385,7 +385,7 @@ Several parameters of full text search engine can be configured from application | | Optimization | string | Optimize the index by 'memory' or by 'cpu' | "memory" | | | FtBaseRanking | struct | Relevance of the word in different forms | | | | Bm25Config | struct | Document ranking function parameters [More...](#basic-document-ranking-algorithms) | | -| | SplitterType | string | Text breakdown algorithm. Available values: 'friso' and 'fast' | "fast" | +| | SplitterType | string | Text breakdown algorithm. Available values: 'mmseg_cn' and 'fast' | "fast" | ### Stopwords details The list item can be either a string or a structure containing a string (the stopword) and a bool attribute (`is_morpheme`) indicating whether the stopword can be part of a word that can be shown in query-results. @@ -451,7 +451,7 @@ FtBaseRanking: config for the base relevancy of the word in different forms. ### Text splitters -Reindexer supports two algorithms to break texts into words: `fast` and `friso`. +Reindexer supports two algorithms to break texts into words: `fast` and `mmseg_cn`. Default `fast` algorithm is based on the definition of a word in the form of a alpha (from supported Unicode subset), number and an extended character, everything else (whitespaces, special characters, unsopported Unicode subsets, etc) will be threated as a delimiters. @@ -487,7 +487,7 @@ Reindexer supports the following unicode block codes / extra symbols: This algorithm is simple and provides high performance, but it can not handle texts without delimiters (for example, Chinese language does not requires whitespaces between words, so `fast`-splitter will not be able to index it properly). -Alternative `friso`-splitter is based on [friso](https://github.com/lionsoul2014/friso) implementation of `mmseg` algorithm and uses dictionaries for tokenization. Currently this splitter supports only Chinese and English languages. +Alternative `mmseg_cn`-splitter is based on [friso](https://github.com/lionsoul2014/friso) implementation of `mmseg` algorithm and uses dictionaries for tokenization. Currently this splitter supports only Chinese and English languages. ### Basic document ranking algorithms diff --git a/reflect.go b/reflect.go index e23197a69..36b957864 100644 --- a/reflect.go +++ b/reflect.go @@ -40,9 +40,13 @@ type indexOptions struct { } func parseRxTags(field reflect.StructField) (idxName string, idxType string, expireAfter string, idxSettings []string) { - tagsSlice := strings.SplitN(field.Tag.Get("reindex"), ",", 3) + tag, isSet := field.Tag.Lookup("reindex") + tagsSlice := strings.SplitN(tag, ",", 3) var idxOpts string idxName, idxType, expireAfter, idxOpts = tagsSlice[0], "", "", "" + if isSet && len(idxName) == 0 && !field.Anonymous && field.Name != "_" { + idxName = field.Name + } if len(tagsSlice) > 1 { idxType = tagsSlice[1] @@ -140,14 +144,14 @@ func parseIndexesImpl(indexDefs *[]bindings.IndexDef, st reflect.Type, subArray } if jsonTag == "-" && !opts.isComposite && !opts.isJoined { - if reindexTag := field.Tag.Get("reindex"); reindexTag != "" { - return fmt.Errorf("non-composite/non-joined field ('%s'), marked with `json:-` can not have explicit reindex tags, but it does ('%s')", field.Name, reindexTag) + if reindexTag, isSet := field.Tag.Lookup("reindex"); isSet { + return fmt.Errorf("non-composite/non-joined field ('%s'), marked with `json:-` can not have explicit reindex tags, but it does (reindex:\"%s\")", field.Name, reindexTag) } continue } if !opts.isComposite && !field.IsExported() { - if reindexTag := field.Tag.Get("reindex"); reindexTag != "" { - return fmt.Errorf("unexported non-composite field ('%s') can not have reindex tags, but it does ('%s')", field.Name, reindexTag) + if reindexTag, isSet := field.Tag.Lookup("reindex"); isSet { + return fmt.Errorf("unexported non-composite field ('%s') can not have reindex tags, but it does (reindex:\"%s\")", field.Name, reindexTag) } continue } diff --git a/test/compatibility_test/compatibility_test.sh b/test/compatibility_test/compatibility_test.sh index 23d5edbe2..4cf7fd816 100755 --- a/test/compatibility_test/compatibility_test.sh +++ b/test/compatibility_test/compatibility_test.sh @@ -39,19 +39,19 @@ test_outdated_instance() { echo "====Master: ${master_cmd}" echo "====Slave: ${slave_cmd}" init_storages - ${master_cmd} --db "${master_db_path}" -l0 --serverlog=\"reindexer_master_$3.1.log\" --corelog=\"reindexer_master_$3.1.log\" --httplog=\"\" --rpclog=\"\" & + ${master_cmd} --db "${master_db_path}" -l trace --serverlog=reindexer_master_$3.1.log --corelog=reindexer_master_$3.1.log --httplog=none --rpclog=none & master_pid=$! sleep 8 go run ${script_dir}/filler.go --dsn "${master_dsn}/${db_name}" --offset 0 echo "====Force sync" - ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l0 --serverlog=\"reindexer_slave_$3.1.log\" --corelog=\"reindexer_slave_$3.1.log\" --httplog=\"\" --rpclog=\"\" & + ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l trace --serverlog=reindexer_slave_$3.1.log --corelog=reindexer_slave_$3.1.log --httplog=none --rpclog=none & slave_pid=$! sleep 8 kill $slave_pid - wait $slave_pid + wait $slave_pid go run ${script_dir}/filler.go --dsn "${master_dsn}/${db_name}" --offset 100 echo "====Sync by WAL" - ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l0 --serverlog=\"reindexer_slave_$3.2.log\" --corelog=\"reindexer_slave_$3.2.log\" --httplog=\"\" --rpclog=\"\" & + ${slave_cmd} --db "${slave_db_path}" -p 9089 -r 6535 -l trace --serverlog=reindexer_slave_$3.2.log --corelog=reindexer_slave_$3.2.log --httplog=none --rpclog=none & slave_pid=$! sleep 12 echo "====Online sync" @@ -60,11 +60,10 @@ test_outdated_instance() { build/cpp_src/cmd/reindexer_tool/reindexer_tool --dsn "${master_dsn}/${db_name}" --command "\dump ${ns_name}" --output "${master_dump}" build/cpp_src/cmd/reindexer_tool/reindexer_tool --dsn "${slave_dsn}/${db_name}" --command "\dump ${ns_name}" --output "${slave_dump}" kill $slave_pid - wait $slave_pid - kill $master_pid - wait $master_pid - sed -i -E "s/(\\NAMESPACES ADD.*)(\"schema\":\"\{.*\}\")/\1\"schema\":\"\{\}\"/" "${master_dump}" - ${script_dir}/compare_dumps.sh "${master_dump}" "${slave_dump}" + wait $slave_pid + kill $master_pid + wait $master_pid + ${script_dir}/compare_dumps.sh "${master_dump}" "${slave_dump}" } echo "====Installing reindexer package====" diff --git a/test/encdec_test.go b/test/encdec_test.go index 09f71e274..990133c9d 100644 --- a/test/encdec_test.go +++ b/test/encdec_test.go @@ -9,6 +9,7 @@ import ( "time" "github.com/restream/reindexer/v3" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -46,63 +47,68 @@ type ( type TestItemEncDec struct { ID int `reindex:"id,-"` *TestEmbedItem - Prices []*TestJoinItem `reindex:"prices,,joined"` - Pricesx []*TestJoinItem `reindex:"pricesx,,joined"` - Packages []int `reindex:"packages,hash"` - UPackages []uint `reindex:"upackages,hash"` - UPackages64 []uint64 `reindex:"upackages64,hash"` - FPackages []float32 `reindex:"fpackages,tree"` - FPackages64 []float64 `reindex:"fpackages64,tree"` - Bool bool `reindex:"bool"` - Bools []bool `reindex:"bools"` - Name string `reindex:"name,tree"` - Countries []string `reindex:"countries,tree"` - Description string `reindex:"description,fuzzytext"` - Rate float64 `reindex:"rate,tree"` - CustomStringsPtr TestCustomStringsPtrs `reindex:"custom_strings_ptrs"` - CustomStrings TestCustomStrings `reindex:"custom_strings"` - CustomInts64 TestCustomInts64 `reindex:"custom_ints64"` - CustomInts16 TestCustomInts16 `reindex:"custom_ints16"` - CustomFloats TestCustomFloats `reindex:"custom_floats"` - IsDeleted bool `reindex:"isdeleted,-"` - PNested *TestNest `reindex:"-"` - Nested TestNest - NestedA [1]TestNest `reindex:"-"` - NonIndexArr []int - NonIndexA [20]float64 - NonIndexPA []*int - Actors []Actor `reindex:"-"` - PricesIDs []int `reindex:"price_id"` - LocationID string `reindex:"location"` - EndTime uint32 `reindex:"end_time,-"` - StartTime uint64 `reindex:"start_time,tree"` - PStrNull *string - PStr *string - Tmp string `reindex:"tmp,-"` - Map1 map[string]int - Map2 map[int64]Actor - Map3 map[int]*Actor - Map4 map[int]*int - Map5 map[int][]int - Map6 map[uint][]uint - Interface interface{} - Interface2 interface{} - InterfaceNull interface{} - MapNull map[string]int - SliceStrNull []string - SliceNull []int - SliceStr []string - SliceUInt []uint - SliceUInt64 []uint64 - NegativeSliceInt64 []int64 - SliceF64 []float64 - SliceF32 []float32 - SliceBool []bool - SliceIface []interface{} - SliceIface1 []interface{} - UInt64 uint64 - UInt32 uint32 - UInt uint + Prices []*TestJoinItem `reindex:"prices,,joined"` + Pricesx []*TestJoinItem `reindex:"pricesx,,joined"` + Packages []int `reindex:"packages,hash"` + UPackages []uint `reindex:"upackages,hash"` + UPackages64 []uint64 `reindex:"upackages64,hash"` + FPackages []float32 `reindex:"fpackages,tree"` + FPackages64 []float64 `reindex:"fpackages64,tree"` + Bool bool `reindex:"bool"` + Bools []bool `reindex:"bools"` + Name string `reindex:"name,tree"` + Countries []string `reindex:"countries,tree"` + Description string `reindex:"description,fuzzytext"` + Rate float64 `reindex:"rate,tree"` + CustomStringsPtr TestCustomStringsPtrs `reindex:"custom_strings_ptrs"` + CustomStrings TestCustomStrings `reindex:"custom_strings"` + CustomInts64 TestCustomInts64 `reindex:"custom_ints64"` + CustomInts16 TestCustomInts16 `reindex:"custom_ints16"` + CustomFloats TestCustomFloats `reindex:"custom_floats"` + IsDeleted bool `reindex:"isdeleted,-"` + EmptyReindexTagStr1 string `reindex:",-"` + EmptyReindexTagStr2 string `reindex:""` + EmptyJsonTagStr string `json:""` + TextLabel1 string `reindex:"TextLabel,text" json:",omitempty"` + TextLabel2 string `reindex:",text" json:",omitempty"` + PNested *TestNest `reindex:"-"` + Nested TestNest + NestedA [1]TestNest `reindex:"-"` + NonIndexArr []int + NonIndexA [20]float64 + NonIndexPA []*int + Actors []Actor `reindex:"-"` + PricesIDs []int `reindex:"price_id"` + LocationID string `reindex:"location"` + EndTime uint32 `reindex:"end_time,-"` + StartTime uint64 `reindex:"start_time,tree"` + PStrNull *string + PStr *string + Tmp string `reindex:"tmp,-"` + Map1 map[string]int + Map2 map[int64]Actor + Map3 map[int]*Actor + Map4 map[int]*int + Map5 map[int][]int + Map6 map[uint][]uint + Interface interface{} + Interface2 interface{} + InterfaceNull interface{} + MapNull map[string]int + SliceStrNull []string + SliceNull []int + SliceStr []string + SliceUInt []uint + SliceUInt64 []uint64 + NegativeSliceInt64 []int64 + SliceF64 []float64 + SliceF32 []float32 + SliceBool []bool + SliceIface []interface{} + SliceIface1 []interface{} + UInt64 uint64 + UInt32 uint32 + UInt uint Custom TestCustomBytes Time time.Time @@ -187,6 +193,11 @@ func FillTestItemsEncDec(start int, count int, pkgsCount int, asJson bool) { Age: rand.Int() % 5, Name: randString(), }, + EmptyReindexTagStr1: randString(), + EmptyReindexTagStr2: randString(), + EmptyJsonTagStr: randString(), + TextLabel1: randString(), + TextLabel2: randString(), PNested: &TestNest{ Age: rand.Int() % 5, Name: randString(), @@ -202,24 +213,24 @@ func FillTestItemsEncDec(start int, count int, pkgsCount int, asJson bool) { `xxxx`: int(rand.Int31()), }, Map2: map[int64]Actor{ - 1: Actor{randString()}, - 100: Actor{randString()}, + 1: {randString()}, + 100: {randString()}, }, Map3: map[int]*Actor{ - 4: &Actor{randString()}, - 2: &Actor{randString()}, + 4: {randString()}, + 2: {randString()}, }, Map4: map[int]*int{ 5: vint1, 120: vint2, }, Map5: map[int][]int{ - 0: []int{1, 2, 3}, - -1: []int{9, 8, 7}, + 0: {1, 2, 3}, + -1: {9, 8, 7}, }, Map6: map[uint][]uint{ - 0: []uint{1, 2, 3}, - 4: []uint{9, 8, 7}, + 0: {1, 2, 3}, + 4: {9, 8, 7}, }, NonIndexPA: []*int{ vint1, @@ -293,15 +304,32 @@ func TestHeterogeneusArrayEncDec(t *testing.T) { defer it.Close() require.NoError(t, it.Error()) - items := make([]interface{}, 0, 1) for it.Next() { item := &TestItemEncDec{} err := json.Unmarshal(it.JSON(), &item) require.NoError(t, err, "error json was: %s\n", it.JSON()) - items = append(items, item) } } +func checkIndexesWithEmptyTags(t *testing.T) { + expectedIndexes := map[string]string{ + "EmptyReindexTagStr1": "-", + "EmptyReindexTagStr2": "hash", + "TextLabel": "text", + "TextLabel2": "text", + } + + desc, err := DB.DescribeNamespace("test_items_encdec") + require.NoError(t, err) + for _, index := range desc.Indexes { + if typ, ok := expectedIndexes[index.Name]; ok { + assert.Equal(t, typ, index.IndexType) + delete(expectedIndexes, index.Name) + } + } + assert.Empty(t, expectedIndexes, "Some of the indexes are missing") +} + func TestEncDec(t *testing.T) { t.Parallel() // Fill items by cjson encoder @@ -310,6 +338,8 @@ func TestEncDec(t *testing.T) { // fill items in json format FillTestItemsEncDec(5000, 10000, 20, true) + checkIndexesWithEmptyTags(t) + // get and decode all items by cjson decoder newTestQuery(DB, "test_items_encdec").ExecAndVerify(t) diff --git a/test/index_struct_test.go b/test/index_struct_test.go index ffd37db08..54336ab19 100644 --- a/test/index_struct_test.go +++ b/test/index_struct_test.go @@ -120,15 +120,15 @@ func TestOpenNs(t *testing.T) { err := DB.OpenNamespace(ns, reindexer.DefaultNamespaceOptions(), FailSimple{}) assert.ErrorContains(t, err, - "non-composite/non-joined field ('Age'), marked with `json:-` can not have explicit reindex tags, but it does ('age,hash')") + "non-composite/non-joined field ('Age'), marked with `json:-` can not have explicit reindex tags, but it does (reindex:\"age,hash\")") err = DB.OpenNamespace(ns, reindexer.DefaultNamespaceOptions(), FailPrivate{}) assert.ErrorContains(t, err, - "unexported non-composite field ('private') can not have reindex tags, but it does ('private,hash')") + "unexported non-composite field ('private') can not have reindex tags, but it does (reindex:\"private,hash\")") err = DB.OpenNamespace(ns, reindexer.DefaultNamespaceOptions(), FailPrivateJoin{}) assert.ErrorContains(t, err, - "unexported non-composite field ('privateAccounts') can not have reindex tags, but it does ('accounts,,joined')") + "unexported non-composite field ('privateAccounts') can not have reindex tags, but it does (reindex:\"accounts,,joined\")") err = DB.OpenNamespace(ns, reindexer.DefaultNamespaceOptions(), FailJoinScalar{}) assert.ErrorContains(t, err,