From 86f0be1e57f09c9a00af4424ffe93336d1dc1528 Mon Sep 17 00:00:00 2001 From: "Denis Kuzmin [ GitHub/3F ]" Date: Sat, 8 Aug 2020 15:54:14 +0300 Subject: [PATCH] Second-order Quantifiers. Added support for `++` Updated tests. Our subset of the regex quantifiers is now standardized as follows: regex | regXwild ---------|---------- .* | * .+ | + .? | ? .{1} | # .{2} | ## .{2, } | ++ .{0, 2} | ?? --- Readme.md | 44 +++++++++----- regXwild/core/ESS/AlgorithmEss.cpp | 27 ++++++++- regXwild/core/ESS/AlgorithmEss.h | 6 +- regXwildTest/AlgorithmEssTest.cpp | 25 +++++++- regXwildTest/EssRangesTest.cpp | 92 ++++++++++++++++++++++++++++++ 5 files changed, 172 insertions(+), 22 deletions(-) diff --git a/Readme.md b/Readme.md index ecbdd9b..72738d0 100644 --- a/Readme.md +++ b/Readme.md @@ -1,15 +1,15 @@ # [regXwild](https://github.com/3F/regXwild) -Small and super Fast advanced wildcards! `*`,`|`,`?`,`^`,`$`,`+`,`#`,`>` in addition to slow regex engine and more. +Small and super Fast Advanced wildcards! `*`,`|`,`?`,`^`,`$`,`+`,`#`,`>` in addition to slow regex engines and more. Unique algorithms that was implemented on native unmanaged C++ but easily accessible also in .NET through **[Conari](https://github.com/3F/Conari)** (recommended due to caching of 0x29 opcodes and other related optimization). 
[![Build status](https://ci.appveyor.com/api/projects/status/8knio1ggle0o8ugh/branch/master?svg=true)](https://ci.appveyor.com/project/3Fs/regxwild-github/branch/master) -[![release](https://img.shields.io/github/release/3F/regXwild.svg)](https://github.com/3F/regXwild/releases/latest) -[![License](https://img.shields.io/badge/License-MIT-74A5C2.svg)](https://github.com/3F/regXwild/blob/master/LICENSE) -[![NuGet package](https://img.shields.io/nuget/v/regXwild.svg)](https://www.nuget.org/packages/regXwild/) -[![Tests](https://img.shields.io/appveyor/tests/3Fs/regxwild-github/master.svg)](https://ci.appveyor.com/project/3Fs/regxwild-github/build/tests) +[![release](https://img.shields.io/github/v/release/3F/regXwild)](https://github.com/3F/regXwild/releases/latest) +[![License](https://img.shields.io/badge/License-MIT-74A5C2)](https://github.com/3F/regXwild/blob/master/LICENSE) +[![NuGet package](https://img.shields.io/nuget/v/regXwild)](https://www.nuget.org/packages/regXwild/) +[![Tests](https://img.shields.io/appveyor/tests/3Fs/regxwild-github/master)](https://ci.appveyor.com/project/3Fs/regxwild-github/build/tests) [![Build history](https://buildstats.info/appveyor/chart/3Fs/regxwild-github?buildCount=20&includeBuildsFromPullRequest=true&showStats=true)](https://ci.appveyor.com/project/3Fs/regxwild-github/history) @@ -30,7 +30,7 @@ Unique algorithms that was implemented on native unmanaged C++ but easily access It was designed to be faster than just fast, when using more features that usually go beyond the typical wildcards. -🔍 Easy to start: +### 🔍 Easy to start Unmanaged native C++ or managed .NET project. 
It doesn't matter, just use it: @@ -44,7 +44,7 @@ if(searchEssC(_T("regXwild"), _T("reg?wild"), true)) { } ``` -C# if you're using [ [Conari](https://github.com/3F/Conari) ] +C# if [ [Conari](https://github.com/3F/Conari) ] ```csharp using(var l = new ConariL("regXwild.dll")) { @@ -54,7 +54,7 @@ using(var l = new ConariL("regXwild.dll")) } ``` -🏄 Amazing meta symbols: +### 🏄 Amazing meta symbols ESS version (advanced EXT version) @@ -63,11 +63,11 @@ enum MetaSymbols { MS_ANY = _T('*'), // {0, ~} MS_SPLIT = _T('|'), // str1 or str2 or ... - MS_ONE = _T('?'), // {0, 1}, ??? - {0, 3}, ... + MS_ONE = _T('?'), // {0, 1}, ??? {0, 3}, ... MS_BEGIN = _T('^'), // [str... or [str1... |[str2... MS_END = _T('$'), // ...str] or ...str1]| ...str2] - MS_MORE = _T('+'), // {1, ~} - MS_SINGLE = _T('#'), // {1} + MS_MORE = _T('+'), // {1, ~}, +++ {3, ~}, ... + MS_SINGLE = _T('#'), // {1}, ## {2}, ### {3}, ... MS_ANYSP = _T('>'), // as [^/]* }; ``` @@ -84,14 +84,28 @@ enum MetaSymbols }; ``` -Check it with our actual **Unit-Tests**. +🧮 Quantifiers -🚀 Awesome speed: +regex | regXwild +---------|---------- +.* | * +.+ | + +.? | ? +.{1} | # +.{2} | ## +.{2, } | ++ +.{0, 2} | ?? + +and similar ... + +Play with our actual **Unit-Tests**. + +### 🚀 Awesome speed * [~2000 times faster when C++](#speed). * For .NET (including modern .NET Core), [Conari](https://github.com/3F/Conari) provides optional caching of 0x29 opcodes (Calli) and more to get a similar result as possible. -🍰 Open and Free: +### 🍰 Open and Free Open Source project; MIT License, Enjoy 🎉 @@ -114,7 +128,7 @@ We're waiting for your awesome contributions! Please note: * **+icase** means ignore case sensitivity when matching the filter(pattern) within the searched string, i.e. `ignoreCase = true`. **Without** this, everything **will be much faster** of course. 
*That is, icase always adds complexity.* -* Commonly **MultiByte** will be faster than **Unicode** (for the same platform and the same way of module use) but it depends on specific architecture and can be about ~2 times faster when native C++, and about ~4 times faster when .NET + Conari and related. +* Below, **MultiByte** can be faster than **Unicode** (for the same platform and the same way of using the module), but this depends on the specific architecture: it can be ~2 times faster with native C++, and ~4 times faster with .NET + Conari and related. * The results below can be different on different machines. You need only look at the difference (in milliseconds) between algorithms for a specific target. * To calculate the data, as in the table below, you need execute `algo.exe` diff --git a/regXwild/core/ESS/AlgorithmEss.cpp b/regXwild/core/ESS/AlgorithmEss.cpp index e88a709..52474e7 100644 --- a/regXwild/core/ESS/AlgorithmEss.cpp +++ b/regXwild/core/ESS/AlgorithmEss.cpp @@ -117,9 +117,11 @@ bool AlgorithmEss::search(const tstring& text, const tstring& filter, bool ignor if(rewindToNextBlock(it)){ continue; } return false; } - // Sequential combinations of characters SINGLE & ONE - if((item.mask.curr & SINGLE && item.mask.prev & SINGLE) || - (item.mask.curr & ONE && item.mask.prev & ONE)){ + // Sequential combinations of #, ?, + + if((item.mask.curr & SINGLE && item.mask.prev & SINGLE) + || (item.mask.curr & ONE && item.mask.prev & ONE) + || (item.mask.curr & MORE && item.mask.prev & MORE)) + { ++item.overlay; } else{ item.overlay = 0; } @@ -261,6 +263,25 @@ udiff_t AlgorithmEss::interval() return words.found; } + // "+" + if(item.mask.prev & MORE) + { + udiff_t len = item.prev.length(); + diff_t lPosMax = words.found - len; + diff_t plim = words.found - words.left; + diff_t lPos = lPosMax - plim - 1; + + if(item.overlay > plim) { // When filter ++++ (4 or more) is more than origin data. 
+ return tstring::npos; + } + + if(_text.substr(lPos, len).compare(item.prev) == 0) { + return words.found; + } + + return tstring::npos; + } + // "?" if(item.mask.prev & ONE && (words.found - words.left) > 1) { diff --git a/regXwild/core/ESS/AlgorithmEss.h b/regXwild/core/ESS/AlgorithmEss.h index fad0c8f..cf1d34a 100644 --- a/regXwild/core/ESS/AlgorithmEss.h +++ b/regXwild/core/ESS/AlgorithmEss.h @@ -53,11 +53,11 @@ namespace net { namespace r_eg { namespace regXwild { namespace core { namespace { MS_ANY = _T('*'), // {0, ~} MS_SPLIT = _T('|'), // str1 or str2 or ... - MS_ONE = _T('?'), // {0, 1}, ??? - {0, 3}, ... + MS_ONE = _T('?'), // {0, 1}, ??? {0, 3}, ... MS_BEGIN = _T('^'), // [str... or [str1... |[str2... MS_END = _T('$'), // ...str] or ...str1]| ...str2] - MS_MORE = _T('+'), // {1, ~} - MS_SINGLE = _T('#'), // {1} + MS_MORE = _T('+'), // {1, ~}, +++ {3, ~}, ... + MS_SINGLE = _T('#'), // {1}, ## {2}, ### {3}, ... MS_ANYSP = _T('>'), // as [^/]* //TODO: >\>/ i.e. '>' + {symbol} }; diff --git a/regXwildTest/AlgorithmEssTest.cpp b/regXwildTest/AlgorithmEssTest.cpp index 72819de..01bca25 100644 --- a/regXwildTest/AlgorithmEssTest.cpp +++ b/regXwildTest/AlgorithmEssTest.cpp @@ -351,7 +351,7 @@ namespace regXwildTest Assert::AreEqual(true, searchEss(data, _T("new++systems"))); Assert::AreEqual(true, searchEss(data, _T("+systems"))); Assert::AreEqual(true, searchEss(data, _T("project+12"))); - Assert::AreEqual(true, searchEss(data, _T("project++12"))); + Assert::AreEqual(false, searchEss(data, _T("project++12"))); Assert::AreEqual(true, searchEss(data, _T("75+*systems"))); Assert::AreEqual(true, searchEss(data, _T("75*+*systems"))); Assert::AreEqual(true, searchEss(data, _T("new+7+system"))); @@ -412,6 +412,26 @@ namespace regXwildTest } } + TEST_METHOD(filterMoreTest6) + { + tstring data = _T("new project20+ 10-pro data"); + + Assert::AreEqual(true, searchEss(data, _T("++"))); + Assert::AreEqual(true, searchEss(data, _T("+++"))); + Assert::AreEqual(true, 
searchEss(data, _T("++++"))); + Assert::AreEqual(true, searchEss(data, _T("++++proj"))); + + Assert::AreEqual(false, searchEss(data, _T("+++++proj"))); + + Assert::AreEqual(false, searchEss(data, _T("project+20"))); + Assert::AreEqual(true, searchEss(data, _T("project+10"))); + Assert::AreEqual(true, searchEss(data, _T("project++10"))); + Assert::AreEqual(true, searchEss(data, _T("project+++10"))); + Assert::AreEqual(true, searchEss(data, _T("project++++10"))); + Assert::AreEqual(false, searchEss(data, _T("project+++++10"))); + Assert::AreEqual(false, searchEss(data, _T("project++++++10"))); + } + TEST_METHOD(filterBeginTest1) { tstring data = _T("new tes;ted project-12, and 75_protection of various systems"); @@ -1007,6 +1027,9 @@ namespace regXwildTest Assert::IsTrue(searchEss(data, _T("1_of"))); Assert::IsTrue(searchEss(data, _T("[1??_of"))); Assert::IsTrue(searchEss(data, _T("[1???_of"))); + Assert::IsTrue(searchEss(data, _T("[1+_of"))); + Assert::IsFalse(searchEss(data, _T("[1++_of"))); + Assert::IsFalse(searchEss(data, _T("[1+++_of"))); } TEST_METHOD(underscoreTest2) diff --git a/regXwildTest/EssRangesTest.cpp b/regXwildTest/EssRangesTest.cpp index 269e4ed..2770852 100644 --- a/regXwildTest/EssRangesTest.cpp +++ b/regXwildTest/EssRangesTest.cpp @@ -113,6 +113,52 @@ namespace regXwildTest Assert::IsFalse(searchEss(_T("number = '12345';"), filter)); } + TEST_METHOD(rangeAtMoreTest1) + { + tstring data = _T("number = '123';"); + + Assert::IsTrue(searchEss(data, _T("number = '+++';"))); + Assert::IsTrue(searchEss(data, _T("number = '++';"))); + Assert::IsTrue(searchEss(data, _T("number = '+';"))); + Assert::IsFalse(searchEss(data, _T("number = '++++';"))); + Assert::IsFalse(searchEss(data, _T("number = '+++++';"))); + } + + TEST_METHOD(rangeAtMoreTest2) + { + tstring filter = _T("number = '+++';"); + + // +++ means 3 or more + Assert::IsFalse(searchEss(_T("number = '';"), filter)); + Assert::IsFalse(searchEss(_T("number = '1';"), filter)); + 
Assert::IsFalse(searchEss(_T("number = '12';"), filter)); + Assert::IsTrue(searchEss(_T("number = '123';"), filter)); + Assert::IsTrue(searchEss(_T("number = '1234';"), filter)); + } + + TEST_METHOD(rangeAtMoreTest3) + { + tstring data = _T("number = '123';"); + + Assert::IsTrue(searchEss(data, _T("ber = '+++';"))); + Assert::IsTrue(searchEss(data, _T("ber = '++';"))); + Assert::IsTrue(searchEss(data, _T("ber = '+';"))); + Assert::IsFalse(searchEss(data, _T("ber = '++++';"))); + Assert::IsFalse(searchEss(data, _T("ber = '+++++';"))); + } + + TEST_METHOD(rangeAtMoreTest4) + { + tstring filter = _T("ber = '+++';"); + + // +++ means 3 or more + Assert::IsFalse(searchEss(_T("number = '';"), filter)); + Assert::IsFalse(searchEss(_T("number = '1';"), filter)); + Assert::IsFalse(searchEss(_T("number = '12';"), filter)); + Assert::IsTrue(searchEss(_T("number = '123';"), filter)); + Assert::IsTrue(searchEss(_T("number = '1234';"), filter)); + } + TEST_METHOD(rangeAtAnyTest1) { tstring data = _T("number = '123';"); @@ -451,6 +497,52 @@ namespace regXwildTest Assert::IsFalse(searchEss(_T("number = '12345';"), filter)); } + TEST_METHOD(limRangeAtMoreTest1) + { + tstring data = _T("number = '123';"); + + Assert::IsTrue(searchEss(data, _T("number = '+++'"))); + Assert::IsTrue(searchEss(data, _T("number = '++'"))); + Assert::IsTrue(searchEss(data, _T("number = '+'"))); + Assert::IsFalse(searchEss(data, _T("number = '++++'"))); + Assert::IsFalse(searchEss(data, _T("number = '+++++'"))); + } + + TEST_METHOD(limRangeAtMoreTest2) + { + tstring filter = _T("number = '+++'"); + + // +++ means 3 or more + Assert::IsFalse(searchEss(_T("number = '';"), filter)); + Assert::IsFalse(searchEss(_T("number = '1';"), filter)); + Assert::IsFalse(searchEss(_T("number = '12';"), filter)); + Assert::IsTrue(searchEss(_T("number = '123';"), filter)); + Assert::IsTrue(searchEss(_T("number = '1234';"), filter)); + } + + TEST_METHOD(limRangeAtMoreTest3) + { + tstring data = _T("number = '123';"); + + 
Assert::IsTrue(searchEss(data, _T("ber = '+++'"))); + Assert::IsTrue(searchEss(data, _T("ber = '++'"))); + Assert::IsTrue(searchEss(data, _T("ber = '+'"))); + Assert::IsFalse(searchEss(data, _T("ber = '++++'"))); + Assert::IsFalse(searchEss(data, _T("ber = '+++++'"))); + } + + TEST_METHOD(limRangeAtMoreTest4) + { + tstring filter = _T("ber = '+++'"); + + // +++ means 3 or more + Assert::IsFalse(searchEss(_T("number = '';"), filter)); + Assert::IsFalse(searchEss(_T("number = '1';"), filter)); + Assert::IsFalse(searchEss(_T("number = '12';"), filter)); + Assert::IsTrue(searchEss(_T("number = '123';"), filter)); + Assert::IsTrue(searchEss(_T("number = '1234';"), filter)); + } + TEST_METHOD(limRangeAtAnyTest1) { tstring data = _T("number = '123';");