Vectorize text equality and LIKE (#6189)

This PR adds vectorized computation of text equality for deterministic collations, and of case-sensitive LIKE when the database encoding is UTF-8.
timescale · Mar 28, 2024 · a3d03ea
1 parent ea5c7f1
commit a3d03ea
Show file tree

Hide file tree

Showing 14 changed files with 1,024 additions and 56 deletions.
diff --git a/tsl/src/CMakeLists.txt b/tsl/src/CMakeLists.txt
@@ -51,4 +51,5 @@ install(TARGETS ${TSL_LIBRARY_NAME} DESTINATION ${PG_PKGLIBDIR})
 add_subdirectory(bgw_policy)
 add_subdirectory(compression)
 add_subdirectory(continuous_aggs)
+add_subdirectory(import)
 add_subdirectory(nodes)
diff --git a/tsl/src/compression/array.c b/tsl/src/compression/array.c
@@ -499,10 +499,10 @@ text_array_decompress_all_serialized_no_header(StringInfo si, bool has_nulls,
 	CheckCompressedData(n_total >= n_notnull);
 
 	uint32 *offsets =
-		(uint32 *) MemoryContextAllocZero(dest_mctx,
-										  pad_to_multiple(64, sizeof(*offsets) * (n_total + 1)));
+		(uint32 *) MemoryContextAlloc(dest_mctx,
+									  pad_to_multiple(64, sizeof(*offsets) * (n_total + 1)));
 	uint8 *arrow_bodies =
-		(uint8 *) MemoryContextAllocZero(dest_mctx, pad_to_multiple(64, si->len - si->cursor));
+		(uint8 *) MemoryContextAlloc(dest_mctx, pad_to_multiple(64, si->len - si->cursor));
 
 	uint32 offset = 0;
 	for (uint32 i = 0; i < n_notnull; i++)

diff --git a/tsl/src/import/CMakeLists.txt b/tsl/src/import/CMakeLists.txt
@@ -0,0 +1,2 @@
+set(SOURCES "") # Placeholder: this directory currently has no sources to compile.
+target_sources(${TSL_LIBRARY_NAME} PRIVATE ${SOURCES}) # Attach to the TSL library target (the one installed by tsl/src/CMakeLists.txt).
diff --git a/tsl/src/import/ts_like_match.c b/tsl/src/import/ts_like_match.c
@@ -0,0 +1,211 @@
+/*
+ * This file and its contents are licensed under the Timescale License.
+ * Please see the included NOTICE for copyright information and
+ * LICENSE-TIMESCALE for a copy of the license.
+ */
+
+/*
+ * This file contains source code that was copied and/or modified from
+ * the PostgreSQL database, which is licensed under the open-source
+ * PostgreSQL License. Please see the NOTICE at the top level
+ * directory for a copy of the PostgreSQL License.
+ *
+ * This is a copy of backend/utils/adt/like_match.c from PG 15.0, git commit sha
+ * 2a7ce2e2ce474504a707ec03e128fde66cfb8b48.
+ * It has one modification: the check_stack_depth() check is moved to happen
+ * before recursion to simplify the non-recursive code path.
+ */
+
+/*--------------------
+ *	Match text and pattern, return LIKE_TRUE, LIKE_FALSE, or LIKE_ABORT.
+ *
+ *	LIKE_TRUE: they match
+ *	LIKE_FALSE: they don't match
+ *	LIKE_ABORT: not only don't they match, but the text is too short.
+ *
+ * If LIKE_ABORT is returned, then no suffix of the text can match the
+ * pattern either, so an upper-level % scan can stop scanning now.
+ *--------------------
+ */
+
+#ifdef MATCH_LOWER
+#define GETCHAR(t) MATCH_LOWER(t) /* compare bytes through MATCH_LOWER when it is defined before this point */
+#else
+#define GETCHAR(t) (t) /* otherwise compare bytes unchanged */
+#endif /* MATCH_LOWER */
+
+static int
+MatchText(const char *t, int tlen, const char *p, int plen)
+{
+	/* Fast path: a lone % pattern matches any text, so skip the scan */
+	if (plen == 1 && *p == '%')
+		return LIKE_TRUE;
+
+	/*
+	 * In this loop, we advance by char when matching wildcards (and thus on
+	 * recursive entry to this function we are properly char-synced). On other
+	 * occasions it is safe to advance by byte, as the text and pattern will
+	 * be in lockstep. This allows us to perform all comparisons between the
+	 * text and pattern on a byte by byte basis, even for multi-byte
+	 * encodings.
+	 */
+	while (tlen > 0 && plen > 0)
+	{
+		if (*p == '\\')
+		{
+			/* Next pattern byte must match literally, whatever it is */
+			NextByte(p, plen);
+			/* ... and there had better be one, per SQL standard */
+			if (plen <= 0)
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+						 errmsg("LIKE pattern must not end with escape character")));
+			if (GETCHAR(*p) != GETCHAR(*t))
+				return LIKE_FALSE;
+		}
+		else if (*p == '%')
+		{
+			char firstpat;
+
+			/*
+			 * % processing is essentially a search for a text position at
+			 * which the remainder of the text matches the remainder of the
+			 * pattern, using a recursive call to check each potential match.
+			 *
+			 * If there are wildcards immediately following the %, we can skip
+			 * over them first, using the idea that any sequence of N _'s and
+			 * one or more %'s is equivalent to N _'s and one % (ie, it will
+			 * match any sequence of at least N text characters).  In this way
+			 * we will always run the recursive search loop using a pattern
+			 * fragment that begins with a literal character-to-match, thereby
+			 * not recursing more than we have to.
+			 */
+			NextByte(p, plen);
+
+			while (plen > 0)
+			{
+				if (*p == '%')
+					NextByte(p, plen);
+				else if (*p == '_')
+				{
+					/* Each _ consumes one text char; if none is left, LIKE_ABORT */
+					if (tlen <= 0)
+						return LIKE_ABORT;
+					NextChar(t, tlen);
+					NextByte(p, plen);
+				}
+				else
+					break; /* Reached a non-wildcard pattern char */
+			}
+
+			/*
+			 * If we're at end of pattern, match: we have a trailing % which
+			 * matches any remaining text string.
+			 */
+			if (plen <= 0)
+				return LIKE_TRUE;
+
+			/*
+			 * Otherwise, scan for a text position at which we can match the
+			 * rest of the pattern.  The first remaining pattern char is known
+			 * to be a regular or escaped literal character, so we can compare
+			 * the first pattern byte to each text byte to avoid recursing
+			 * more than we have to.  This fact also guarantees that we don't
+			 * have to consider a match to the zero-length substring at the
+			 * end of the text.
+			 */
+			if (*p == '\\')
+			{
+				if (plen < 2)
+					ereport(ERROR,
+							(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
+							 errmsg("LIKE pattern must not end with escape character")));
+				firstpat = GETCHAR(p[1]);
+			}
+			else
+				firstpat = GETCHAR(*p);
+
+			while (tlen > 0)
+			{
+				if (GETCHAR(*t) == firstpat)
+				{
+					/* This function recurses, so check stack depth just before each recursive call */
+					check_stack_depth();
+
+					int matched = MatchText(t, tlen, p, plen);
+
+					if (matched != LIKE_FALSE)
+						return matched; /* LIKE_TRUE or LIKE_ABORT */
+				}
+
+				NextChar(t, tlen);
+			}
+
+			/*
+			 * End of text with no match, so no point in trying later places
+			 * to start matching this pattern.
+			 */
+			return LIKE_ABORT;
+		}
+		else if (*p == '_')
+		{
+			/* _ matches any single character; the loop condition guarantees tlen > 0 */
+			NextChar(t, tlen);
+			NextByte(p, plen);
+			continue;
+		}
+		else if (GETCHAR(*p) != GETCHAR(*t))
+		{
+			/* non-wildcard pattern char fails to match text char */
+			return LIKE_FALSE;
+		}
+
+		/*
+		 * Pattern and text match, so advance.
+		 *
+		 * It is safe to use NextByte instead of NextChar here, even for
+		 * multi-byte character sets, because we are not following immediately
+		 * after a wildcard character. If we are in the middle of a multibyte
+		 * character, we must already have matched at least one byte of the
+		 * character from both text and pattern; so we cannot get out-of-sync
+		 * on character boundaries.  And we know that no backend-legal
+		 * encoding allows ASCII characters such as '%' to appear as non-first
+		 * bytes of characters, so we won't mistakenly detect a new wildcard.
+		 */
+		NextByte(t, tlen);
+		NextByte(p, plen);
+	}
+
+	if (tlen > 0)
+		return LIKE_FALSE; /* pattern exhausted while text remains */
+
+	/*
+	 * End of text, but perhaps not of pattern.  Match iff the remaining
+	 * pattern can match a zero-length string, ie, it's zero or more %'s.
+	 */
+	while (plen > 0 && *p == '%')
+		NextByte(p, plen);
+	if (plen <= 0)
+		return LIKE_TRUE;
+
+	/*
+	 * End of text with no match, so no point in trying later places to start
+	 * matching this pattern.
+	 */
+	return LIKE_ABORT;
+} /* MatchText() */
+
+#ifdef CHAREQ
+#undef CHAREQ
+#endif /* CHAREQ */
+
+#undef NextChar /* not defined in this file; presumably supplied by the including file */
+#undef CopyAdvChar /* not defined or used in this file; presumably supplied by the including file */
+#undef MatchText /* drops any macro alias for the function name so the name can be redefined */
+
+#undef GETCHAR /* defined near the top of this file */
+
+#ifdef MATCH_LOWER
+#undef MATCH_LOWER
+
+#endif /* MATCH_LOWER */
diff --git a/tsl/src/nodes/decompress_chunk/CMakeLists.txt b/tsl/src/nodes/decompress_chunk/CMakeLists.txt
@@ -8,6 +8,7 @@ set(SOURCES
     ${CMAKE_CURRENT_SOURCE_DIR}/detoaster.c
     ${CMAKE_CURRENT_SOURCE_DIR}/exec.c
     ${CMAKE_CURRENT_SOURCE_DIR}/planner.c
+    ${CMAKE_CURRENT_SOURCE_DIR}/pred_text.c
     ${CMAKE_CURRENT_SOURCE_DIR}/pred_vector_array.c
     ${CMAKE_CURRENT_SOURCE_DIR}/qual_pushdown.c
     ${CMAKE_CURRENT_SOURCE_DIR}/vector_predicates.c)
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		set(SOURCES "")
		target_sources(${PROJECT_NAME} PRIVATE ${SOURCES})