Skip to content

Commit

Permalink
[fea] Add bench for string comparator
Browse files Browse the repository at this point in the history
  • Loading branch information
reindexer-bot committed Jan 15, 2024
1 parent 54027a7 commit 3a31b6f
Show file tree
Hide file tree
Showing 52 changed files with 1,181 additions and 306 deletions.
8 changes: 4 additions & 4 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Version 3.21.0 (15.12.2023)
## Core
- [fea] Added [subqueries](#subqueries-nested-queries) support (`explain` for subqueries will be implemented in the next releases)
- [fea] Added [subqueries](#subqueries-nested-queries) support (`explain` for subqueries will be implement in the next releases)
- [fea] Added backtraces/minidump support for Windows platform
- [fea] Added query crash tracker support for Windows platform
- [fix] Added explicit error for aggregations in joined queries
Expand All @@ -16,16 +16,16 @@

## Go connector
- [fea] Added Go API and DSL-convertor for subqueries
- [fea] Changed CJSON-to-object conversion logic for slices: now the single JSON values and fields with multiple JSON-paths will be concatenated together in the target field
- [fea] Added `WithStrictJoinHandlers`. This option allows to validate JoinHandlers usage at runtime
- [fea] Changed CJSON-to-object convetrion logic for slices: now the single JSON values and fields with multiple JSON-paths will be concatenated together in the target field
- [fea] Added `WithStrictJoinHandlers`. This option allows to validate JoinHandlers usage in runtime
- [fix] Fixed panic handling in the CJSON deserialization
- [fix] Fixed logging in `cproto`-binding. Error messages will no longer be redirected to stdout instead of user's logger

## Face
- [fea] Saved the scroll position on the sorting
- [fea] Changed the Server ID range
- [fea] Improved the notification about the supported browsers
- [fea] Added the default values to the config form when the default config is used
- [fea] Added the default values to the config form when the default config is using
- [fix] Fixed the wrong redirect to a fake database
- [fix] Fixed the column order changing on the data sorting
- [fix] Fixed the horizontal scroll on the data sorting
Expand Down
2 changes: 1 addition & 1 deletion cpp_src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -731,7 +731,7 @@ if (NOT WIN32)
SET(CMAKE_INSTALL_DEFAULT_COMPONENT_NAME "server")
SET(DIST_INCLUDE_FILES
"tools/errors.h" "tools/serializer.h" "tools/varint.h" "tools/stringstools.h" "tools/customhash.h" "tools/assertrx.h" "tools/jsonstring.h"
"tools/verifying_updater.h"
"tools/verifying_updater.h" "tools/customlocal.h"
"core/reindexer.h" "core/type_consts.h" "core/item.h" "core/payload/payloadvalue.h" "core/payload/payloadiface.h" "core/indexopts.h"
"core/namespacedef.h" "core/keyvalue/variant.h" "core/keyvalue/geometry.h" "core/sortingprioritiestable.h"
"core/rdxcontext.h" "core/activity_context.h" "core/type_consts_helpers.h" "core/payload/fieldsset.h" "core/payload/payloadtype.h"
Expand Down
16 changes: 7 additions & 9 deletions cpp_src/cmd/reindexer_server/contrib/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
FROM alpine:3.14 AS build
FROM alpine:3.19 AS build

RUN cd /tmp && apk update && \
apk add git curl autoconf automake libtool linux-headers g++ make libunwind-dev grpc-dev grpc protobuf-dev c-ares-dev && \
apk add git curl autoconf automake libtool linux-headers g++ make libunwind-dev grpc-dev protobuf-dev c-ares-dev patch && \
git clone https://github.com/gperftools/gperftools.git && \
cd gperftools && \
echo "noinst_PROGRAMS =" >> Makefile.am && \
sed -i s/_sigev_un\._tid/sigev_notify_thread_id/ src/profile-handler.cc && \
./autogen.sh && ./configure --disable-dependency-tracking && make -j8 && make install
./autogen.sh && ./configure --disable-dependency-tracking && make -j8 && make install

ADD . /src

Expand All @@ -19,17 +18,16 @@ RUN ./dependencies.sh && \
make -j8 reindexer_server reindexer_tool && \
make install -C cpp_src/cmd/reindexer_server && \
make install -C cpp_src/cmd/reindexer_tool && \
make install -C cpp_src/server/grpc && \
cp ../cpp_src/cmd/reindexer_server/contrib/entrypoint.sh /entrypoint.sh && \
rm -rf /usr/local/lib/*.a /usr/local/include /usr/local/lib/libtcmalloc_debug* /usr/local/lib/libtcmalloc_minimal* \
/usr/local/lib/libprofiler* /usr/local/lib/libtcmalloc.* /usr/local/share/doc /usr/local/share/man /usr/local/share/perl5 /usr/local/bin/pprof*
/usr/local/lib/libprofiler* /usr/local/lib/libtcmalloc.* /usr/local/share/doc /usr/local/share/man /usr/local/share/perl5 /usr/local/bin/pprof*

RUN cd build && make install -C cpp_src/server/grpc

FROM alpine:3.14
FROM alpine:3.19

COPY --from=build /usr/local /usr/local
COPY --from=build /entrypoint.sh /entrypoint.sh
RUN apk update && apk add libstdc++ libunwind snappy libexecinfo leveldb c-ares libprotobuf xz-libs && rm -rf /var/cache/apk/*
RUN apk update && apk add libstdc++ libunwind snappy leveldb c-ares libprotobuf xz-libs grpc-cpp && rm -rf /var/cache/apk/*

ENV RX_DATABASE /db
ENV RX_CORELOG stdout
Expand Down
195 changes: 195 additions & 0 deletions cpp_src/cmd/reindexer_server/test/test_storage_compatibility.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
#!/bin/bash
# Task: https://github.com/restream/reindexer/-/issues/1188
set -e

function KillAndRemoveServer {
local pid=$1
kill $pid
wait $pid
yum remove -y 'reindexer*' > /dev/null
}

function WaitForDB {
# wait until DB is loaded
set +e # disable "exit on error" so the script won't stop when DB's not loaded yet
is_connected=$(reindexer_tool --dsn $ADDRESS --command '\databases list');
while [[ $is_connected != "test" ]]
do
sleep 2
is_connected=$(reindexer_tool --dsn $ADDRESS --command '\databases list');
done
set -e
}

function CompareNamespacesLists {
local ns_list_actual=$1
local ns_list_expected=$2
local pid=$3

diff=$(echo ${ns_list_actual[@]} ${ns_list_expected[@]} | tr ' ' '\n' | sort | uniq -u) # compare in any order
if [ "$diff" == "" ]; then
echo "## PASS: namespaces list not changed"
else
echo "##### FAIL: namespaces list was changed"
echo "expected: $ns_list_expected"
echo "actual: $ns_list_actual"
KillAndRemoveServer $pid;
exit 1
fi
}

function CompareMemstats {
local actual=$1
local expected=$2
local pid=$3
diff=$(echo ${actual[@]} ${expected[@]} | tr ' ' '\n' | sed 's/\(.*\),$/\1/' | sort | uniq -u) # compare in any order
if [ "$diff" == "" ]; then
echo "## PASS: memstats not changed"
else
echo "##### FAIL: memstats was changed"
echo "expected: $expected"
echo "actual: $actual"
KillAndRemoveServer $pid;
exit 1
fi
}


RX_SERVER_CURRENT_VERSION_RPM="$(basename build/reindexer-*server*.rpm)"
VERSION_FROM_RPM=$(echo "$RX_SERVER_CURRENT_VERSION_RPM" | grep -o '.*server-..')
VERSION=$(echo ${VERSION_FROM_RPM: -2:1}) # one-digit version

echo "## choose latest release rpm file"
if [ $VERSION == 3 ]; then
LATEST_RELEASE=$(python3 cpp_src/cmd/reindexer_server/test/get_last_rx_version.py -v 3)
namespaces_list_expected=$'purchase_options_ext_dict\nchild_account_recommendations\n#config\n#activitystats\nradio_channels\ncollections\n#namespaces\nwp_imports_tasks\nepg_genres\nrecom_media_items_personal\nrecom_epg_archive_default\n#perfstats\nrecom_epg_live_default\nmedia_view_templates\nasset_video_servers\nwp_tasks_schedule\nadmin_roles\n#clientsstats\nrecom_epg_archive_personal\nrecom_media_items_similars\nmenu_items\naccount_recommendations\nkaraoke_items\nmedia_items\nbanners\n#queriesperfstats\nrecom_media_items_default\nrecom_epg_live_personal\nservices\n#memstats\nchannels\nmedia_item_recommendations\nwp_tasks_tasks\nepg'
elif [ $VERSION == 4 ]; then
LATEST_RELEASE=$(python3 cpp_src/cmd/reindexer_server/test/get_last_rx_version.py -v 4)
# replicationstats ns added for v4
namespaces_list_expected=$'purchase_options_ext_dict\nchild_account_recommendations\n#config\n#activitystats\n#replicationstats\nradio_channels\ncollections\n#namespaces\nwp_imports_tasks\nepg_genres\nrecom_media_items_personal\nrecom_epg_archive_default\n#perfstats\nrecom_epg_live_default\nmedia_view_templates\nasset_video_servers\nwp_tasks_schedule\nadmin_roles\n#clientsstats\nrecom_epg_archive_personal\nrecom_media_items_similars\nmenu_items\naccount_recommendations\nkaraoke_items\nmedia_items\nbanners\n#queriesperfstats\nrecom_media_items_default\nrecom_epg_live_personal\nservices\n#memstats\nchannels\nmedia_item_recommendations\nwp_tasks_tasks\nepg'
else
echo "Unknown version"
exit 1
fi

echo "## downloading latest release rpm file: $LATEST_RELEASE"
curl "http://repo.itv.restr.im/itv-api-ng/7/x86_64/$LATEST_RELEASE" --output $LATEST_RELEASE;
echo "## downloading example DB"
curl "https://git.restream.ru/MaksimKravchuk/reindexer_testdata/-/raw/master/big.zip" --output big.zip;
unzip -o big.zip # unzips into mydb_big.rxdump;

ADDRESS="cproto://127.0.0.1:6534/"
DB_NAME="test"

memstats_expected=$'[
{"replication":{"data_hash":24651210926,"data_count":3}},
{"replication":{"data_hash":6252344969,"data_count":1}},
{"replication":{"data_hash":37734732881,"data_count":28}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":1024095024522,"data_count":1145}},
{"replication":{"data_hash":8373644068,"data_count":1315}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":7404222244,"data_count":97}},
{"replication":{"data_hash":94132837196,"data_count":4}},
{"replication":{"data_hash":1896088071,"data_count":2}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":-672103903,"data_count":33538}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":6833710705,"data_count":1}},
{"replication":{"data_hash":5858155773472,"data_count":4500}},
{"replication":{"data_hash":-473221280268823592,"data_count":65448}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":8288213744,"data_count":3}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":0,"data_count":0}},
{"replication":{"data_hash":354171024786967,"data_count":3941}},
{"replication":{"data_hash":-6520334670,"data_count":35886}},
{"replication":{"data_hash":112772074632,"data_count":281}},
{"replication":{"data_hash":-12679568198538,"data_count":1623116}}
]
Returned 27 rows'

echo "##### Forward compatibility test #####"

DB_PATH=$(pwd)"/rx_db"

echo "Database: "$DB_PATH

echo "## installing latest release: $LATEST_RELEASE"
yum install -y $LATEST_RELEASE > /dev/null;
# run RX server with disabled logging
reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH &
server_pid=$!
sleep 2;

reindexer_tool --dsn $ADDRESS$DB_NAME -f mydb_big.rxdump --createdb;
sleep 1;

namespaces_1=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list');
echo $namespaces_1;
CompareNamespacesLists "${namespaces_1[@]}" "${namespaces_list_expected[@]}" $server_pid;

memstats_1=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select replication.data_hash, replication.data_count from #memstats');
CompareMemstats "${memstats_1[@]}" "${memstats_expected[@]}" $server_pid;

KillAndRemoveServer $server_pid;

echo "## installing current version: $RX_SERVER_CURRENT_VERSION_RPM"
yum install -y build/*.rpm > /dev/null;
reindexer_server -l0 --corelog=none --httplog=none --rpclog=none --db $DB_PATH &
server_pid=$!
sleep 2;

WaitForDB

namespaces_2=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list');
echo $namespaces_2;
CompareNamespacesLists "${namespaces_2[@]}" "${namespaces_1[@]}" $server_pid;

memstats_2=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select replication.data_hash, replication.data_count from #memstats');
CompareMemstats "${memstats_2[@]}" "${memstats_1[@]}" $server_pid;

KillAndRemoveServer $server_pid;
rm -rf $DB_PATH;
sleep 1;

echo "##### Backward compatibility test #####"

echo "## installing current version: $RX_SERVER_CURRENT_VERSION_RPM"
yum install -y build/*.rpm > /dev/null;
reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH &
server_pid=$!
sleep 2;

reindexer_tool --dsn $ADDRESS$DB_NAME -f mydb_big.rxdump --createdb;
sleep 1;

namespaces_3=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list');
echo $namespaces_3;
CompareNamespacesLists "${namespaces_3[@]}" "${namespaces_list_expected[@]}" $server_pid;

memstats_3=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select replication.data_hash, replication.data_count from #memstats');
CompareMemstats "${memstats_3[@]}" "${memstats_expected[@]}" $server_pid;

KillAndRemoveServer $server_pid;

echo "## installing latest release: $LATEST_RELEASE"
yum install -y $LATEST_RELEASE > /dev/null;
reindexer_server -l warning --httplog=none --rpclog=none --db $DB_PATH &
server_pid=$!
sleep 2;

WaitForDB

namespaces_4=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command '\namespaces list');
echo $namespaces_4;
CompareNamespacesLists "${namespaces_4[@]}" "${namespaces_3[@]}" $server_pid;

memstats_4=$(reindexer_tool --dsn $ADDRESS$DB_NAME --command 'select replication.data_hash, replication.data_count from #memstats');
CompareMemstats "${memstats_4[@]}" "${memstats_3[@]}" $server_pid;

KillAndRemoveServer $server_pid;
rm -rf $DB_PATH;
30 changes: 24 additions & 6 deletions cpp_src/core/ft/config/baseftconfig.cc
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@

#include "baseftconfig.h"
#include <string.h>
#include "core/cjson/jsonbuilder.h"
#include "core/ft/stopwords/stop.h"
#include "tools/errors.h"
#include "tools/jsontools.h"

namespace reindexer {

BaseFTConfig::BaseFTConfig() {
for (const char **p = stop_words_en; *p != nullptr; p++) stopWords.insert(*p);
for (const char **p = stop_words_ru; *p != nullptr; p++) stopWords.insert(*p);
for (const char **p = stop_words_en; *p != nullptr; p++) stopWords.insert({*p, StopWord::Type::Morpheme});
for (const char **p = stop_words_ru; *p != nullptr; p++) stopWords.insert({*p, StopWord::Type::Morpheme});
}

void BaseFTConfig::parseBase(const gason::JsonNode &root) {
Expand All @@ -25,7 +23,25 @@ void BaseFTConfig::parseBase(const gason::JsonNode &root) {
auto &stopWordsNode = root["stop_words"];
if (!stopWordsNode.empty()) {
stopWords.clear();
for (auto &sw : stopWordsNode) stopWords.insert(sw.As<std::string>());
for (auto &sw : stopWordsNode) {
std::string word;
StopWord::Type type = StopWord::Type::Stop;
if (sw.value.getTag() == gason::JsonTag::JSON_STRING) {
word = sw.As<std::string>();
} else if (sw.value.getTag() == gason::JsonTag::JSON_OBJECT) {
word = sw["word"].As<std::string>();
type = sw["is_morpheme"].As<bool>() ? StopWord::Type::Morpheme : StopWord::Type::Stop;
}

if (std::find_if(word.begin(), word.end(), [](const auto &symbol) { return std::isspace(symbol); }) != word.end()) {
throw Error(errParams, "Stop words can't contain spaces: %s", word);
}

auto [it, inserted] = stopWords.emplace(std::move(word), type);
if (!inserted && it->type != type) {
throw Error(errParams, "Duplicate stop-word with different morpheme attribute: %s", *it);
}
}
}

auto &stemmersNode = root["stemmers"];
Expand Down Expand Up @@ -80,7 +96,9 @@ void BaseFTConfig::getJson(JsonBuilder &jsonBuilder) const {
{
auto stopWordsNode = jsonBuilder.Array("stop_words");
for (const auto &sw : stopWords) {
stopWordsNode.Put(nullptr, sw);
auto wordNode = stopWordsNode.Object(nullptr);
wordNode.Put("word", sw);
wordNode.Put("is_morpheme", sw.type == StopWord::Type::Morpheme);
}
}
{
Expand Down
9 changes: 8 additions & 1 deletion cpp_src/core/ft/config/baseftconfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ static constexpr int kMinMergeLimitValue = 0;

class JsonBuilder;

struct StopWord : std::string {
enum class Type { Stop, Morpheme };
StopWord(std::string base, Type type = Type::Stop) noexcept : std::string(std::move(base)), type(type) {}
Type type;
};

class BaseFTConfig {
public:
struct Synonym {
Expand All @@ -39,7 +45,8 @@ class BaseFTConfig {
bool enableKbLayout = true;
bool enableNumbersSearch = false;
bool enableWarmupOnNsCopy = false;
fast_hash_set<std::string, hash_str, equal_str, less_str> stopWords;

fast_hash_set<StopWord, hash_str, equal_str, less_str> stopWords;
std::vector<Synonym> synonyms;
int logLevel = 0;
std::string extraWordSymbols = "-/+"; // word contains symbols (IsAlpa | IsDigit) {IsAlpa | IsDigit | IsExtra}
Expand Down
Loading

0 comments on commit 3a31b6f

Please sign in to comment.