Skip to content

Commit

Permalink
Vectorize text equality and LIKE (#6189)
Browse files Browse the repository at this point in the history
This PR adds vectorized computation of text equality for deterministic
collations, and case-sensitive LIKE for UTF-8 database encoding.
  • Loading branch information
akuzm authored Mar 28, 2024
1 parent ea5c7f1 commit a3d03ea
Show file tree
Hide file tree
Showing 14 changed files with 1,024 additions and 56 deletions.
1 change: 1 addition & 0 deletions tsl/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,5 @@ install(TARGETS ${TSL_LIBRARY_NAME} DESTINATION ${PG_PKGLIBDIR})
add_subdirectory(bgw_policy)
add_subdirectory(compression)
add_subdirectory(continuous_aggs)
add_subdirectory(import)
add_subdirectory(nodes)
6 changes: 3 additions & 3 deletions tsl/src/compression/array.c
Original file line number Diff line number Diff line change
Expand Up @@ -499,10 +499,10 @@ text_array_decompress_all_serialized_no_header(StringInfo si, bool has_nulls,
CheckCompressedData(n_total >= n_notnull);

uint32 *offsets =
(uint32 *) MemoryContextAllocZero(dest_mctx,
pad_to_multiple(64, sizeof(*offsets) * (n_total + 1)));
(uint32 *) MemoryContextAlloc(dest_mctx,
pad_to_multiple(64, sizeof(*offsets) * (n_total + 1)));
uint8 *arrow_bodies =
(uint8 *) MemoryContextAllocZero(dest_mctx, pad_to_multiple(64, si->len - si->cursor));
(uint8 *) MemoryContextAlloc(dest_mctx, pad_to_multiple(64, si->len - si->cursor));

uint32 offset = 0;
for (uint32 i = 0; i < n_notnull; i++)
Expand Down
2 changes: 2 additions & 0 deletions tsl/src/import/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Sources imported from PostgreSQL for the TSL module.
#
# The list is intentionally empty for now.
# NOTE(review): ts_like_match.c relies on macros it does not define itself
# (e.g. NextByte, NextChar), so it is presumably #included by another
# translation unit rather than compiled on its own -- confirm before adding it
# here.
#
# Attach to the TSL library target (the one the parent tsl/src/CMakeLists.txt
# installs) rather than ${PROJECT_NAME}, so that any file added to this list
# later is compiled into the TSL module and not into a different target.
set(SOURCES "")
target_sources(${TSL_LIBRARY_NAME} PRIVATE ${SOURCES})
211 changes: 211 additions & 0 deletions tsl/src/import/ts_like_match.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
/*
* This file and its contents are licensed under the Timescale License.
* Please see the included NOTICE for copyright information and
* LICENSE-TIMESCALE for a copy of the license.
*/

/*
* This file contains source code that was copied and/or modified from
* the PostgreSQL database, which is licensed under the open-source
* PostgreSQL License. Please see the NOTICE at the top level
* directory for a copy of the PostgreSQL License.
*
* This is a copy of backend/utils/adt/like_match.c from PG 15.0, git commit sha
* 2a7ce2e2ce474504a707ec03e128fde66cfb8b48.
* It has one modification: the check_stack_depth() check is moved to happen
* before recursion to simplify the non-recursive code path.
*/

/*--------------------
* Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
*
* LIKE_TRUE: they match
* LIKE_FALSE: they don't match
* LIKE_ABORT: not only don't they match, but the text is too short.
*
* If LIKE_ABORT is returned, then no suffix of the text can match the
* pattern either, so an upper-level % scan can stop scanning now.
*--------------------
*/

/*
* GETCHAR(t) is applied to every text and pattern byte before the two are
* compared. If MATCH_LOWER is defined (it is not defined in this file, so
* presumably by whoever includes it), the byte is passed through MATCH_LOWER
* first; otherwise the byte is used unchanged.
*/
#ifdef MATCH_LOWER
#define GETCHAR(t) MATCH_LOWER(t)
#else
#define GETCHAR(t) (t)
#endif

/*
* MatchText: match the text against a LIKE pattern.
*
* t, tlen: pointer to the current position in the text and the remaining
* length of the text.
* p, plen: pointer to the current position in the pattern and the remaining
* length of the pattern.
*
* Pattern syntax handled here: '%' and '_' are wildcards, and a backslash
* makes the following pattern byte match literally.
*
* Returns LIKE_TRUE, LIKE_FALSE or LIKE_ABORT. Raises an ERROR with
* ERRCODE_INVALID_ESCAPE_SEQUENCE if a backslash that ends the pattern is
* reached while matching.
*
* NOTE(review): NextByte, NextChar and the LIKE_* result codes are not defined
* in this file; presumably NextByte advances one byte and NextChar one whole
* (possibly multi-byte) character, each decrementing the length -- confirm
* against the including file.
*/
static int
MatchText(const char *t, int tlen, const char *p, int plen)
{
/* Fast path for match-everything pattern */
if (plen == 1 && *p == '%')
return LIKE_TRUE;

/*
* In this loop, we advance by char when matching wildcards (and thus on
* recursive entry to this function we are properly char-synced). On other
* occasions it is safe to advance by byte, as the text and pattern will
* be in lockstep. This allows us to perform all comparisons between the
* text and pattern on a byte by byte basis, even for multi-byte
* encodings.
*/
while (tlen > 0 && plen > 0)
{
if (*p == '\\')
{
/* Next pattern byte must match literally, whatever it is */
NextByte(p, plen);
/* ... and there had better be one, per SQL standard */
if (plen <= 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("LIKE pattern must not end with escape character")));
if (GETCHAR(*p) != GETCHAR(*t))
return LIKE_FALSE;
/*
* The escaped byte matched; it is consumed by the common advance at
* the bottom of the loop.
*/
}
else if (*p == '%')
{
char firstpat;

/*
* % processing is essentially a search for a text position at
* which the remainder of the text matches the remainder of the
* pattern, using a recursive call to check each potential match.
*
* If there are wildcards immediately following the %, we can skip
* over them first, using the idea that any sequence of N _'s and
* one or more %'s is equivalent to N _'s and one % (ie, it will
* match any sequence of at least N text characters). In this way
* we will always run the recursive search loop using a pattern
* fragment that begins with a literal character-to-match, thereby
* not recursing more than we have to.
*/
NextByte(p, plen);

while (plen > 0)
{
if (*p == '%')
NextByte(p, plen);
else if (*p == '_')
{
/* If not enough text left to match the pattern, ABORT */
if (tlen <= 0)
return LIKE_ABORT;
NextChar(t, tlen);
NextByte(p, plen);
}
else
break; /* Reached a non-wildcard pattern char */
}

/*
* If we're at end of pattern, match: we have a trailing % which
* matches any remaining text string.
*/
if (plen <= 0)
return LIKE_TRUE;

/*
* Otherwise, scan for a text position at which we can match the
* rest of the pattern. The first remaining pattern char is known
* to be a regular or escaped literal character, so we can compare
* the first pattern byte to each text byte to avoid recursing
* more than we have to. This fact also guarantees that we don't
* have to consider a match to the zero-length substring at the
* end of the text.
*/
if (*p == '\\')
{
if (plen < 2)
ereport(ERROR,
(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
errmsg("LIKE pattern must not end with escape character")));
firstpat = GETCHAR(p[1]);
}
else
firstpat = GETCHAR(*p);

while (tlen > 0)
{
if (GETCHAR(*t) == firstpat)
{
/* Since this function recurses, it could be driven to stack overflow */
check_stack_depth();

int matched = MatchText(t, tlen, p, plen);

/*
* LIKE_FALSE only rules out this starting position, so keep
* scanning; any other result is final and is propagated.
*/
if (matched != LIKE_FALSE)
return matched; /* TRUE or ABORT */
}

NextChar(t, tlen);
}

/*
* End of text with no match, so no point in trying later places
* to start matching this pattern.
*/
return LIKE_ABORT;
}
else if (*p == '_')
{
/* _ matches any single character, and we know there is one */
NextChar(t, tlen);
NextByte(p, plen);
/* Both cursors have already moved; skip the byte-wise advance below */
continue;
}
else if (GETCHAR(*p) != GETCHAR(*t))
{
/* non-wildcard pattern char fails to match text char */
return LIKE_FALSE;
}

/*
* Pattern and text match, so advance.
*
* It is safe to use NextByte instead of NextChar here, even for
* multi-byte character sets, because we are not following immediately
* after a wildcard character. If we are in the middle of a multibyte
* character, we must already have matched at least one byte of the
* character from both text and pattern; so we cannot get out-of-sync
* on character boundaries. And we know that no backend-legal
* encoding allows ASCII characters such as '%' to appear as non-first
* bytes of characters, so we won't mistakenly detect a new wildcard.
*/
NextByte(t, tlen);
NextByte(p, plen);
}

if (tlen > 0)
return LIKE_FALSE; /* end of pattern, but not of text */

/*
* End of text, but perhaps not of pattern. Match iff the remaining
* pattern can match a zero-length string, ie, it's zero or more %'s.
*/
while (plen > 0 && *p == '%')
NextByte(p, plen);
if (plen <= 0)
return LIKE_TRUE;

/*
* End of text with no match, so no point in trying later places to start
* matching this pattern.
*/
return LIKE_ABORT;
} /* MatchText() */

/*
* Undefine the configuration macros used above. CHAREQ, NextChar, CopyAdvChar
* and MatchText are not defined in this file, so they presumably come from the
* file that includes this one; removing them here would let that file set them
* up again for another inclusion -- TODO confirm against the including file.
*/
#ifdef CHAREQ
#undef CHAREQ
#endif

#undef NextChar
#undef CopyAdvChar
#undef MatchText

#undef GETCHAR

/* MATCH_LOWER is optional, so only undefine it when it was provided */
#ifdef MATCH_LOWER
#undef MATCH_LOWER

#endif
1 change: 1 addition & 0 deletions tsl/src/nodes/decompress_chunk/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ set(SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/detoaster.c
${CMAKE_CURRENT_SOURCE_DIR}/exec.c
${CMAKE_CURRENT_SOURCE_DIR}/planner.c
${CMAKE_CURRENT_SOURCE_DIR}/pred_text.c
${CMAKE_CURRENT_SOURCE_DIR}/pred_vector_array.c
${CMAKE_CURRENT_SOURCE_DIR}/qual_pushdown.c
${CMAKE_CURRENT_SOURCE_DIR}/vector_predicates.c)
Expand Down
Loading

0 comments on commit a3d03ea

Please sign in to comment.