Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ao related commits cherry-pick #884

Merged
merged 5 commits into from
Jan 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/backend/access/appendonly/appendonlyam.c
Original file line number Diff line number Diff line change
Expand Up @@ -2897,7 +2897,7 @@ appendonly_insert(AppendOnlyInsertDesc aoInsertDesc,
*/
if (need_toast)
tup = memtup_toast_insert_or_update(relation, instup,
NULL, aoInsertDesc->mt_bind, 0);
NULL, aoInsertDesc->mt_bind, aoInsertDesc->toast_tuple_target, 0);
else
tup = instup;

Expand Down
5 changes: 4 additions & 1 deletion src/backend/access/bitmap/bitmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,9 @@ copy_scan_desc(IndexScanDesc scan)
*
* If newentry is false, we're calling the function with a partially filled
* page table entry. Otherwise, the entry is empty.
 *
 * This function is only used by the stream bitmap scan, i.e. the
 * BitmapIndexScan + BitmapHeapScan combination.
*/

static bool
Expand Down Expand Up @@ -960,7 +963,7 @@ words_get_match(BMBatchWords *words, BMIterateResult *result,
*/
if (words->firstTid < result->nextTid)
{
Assert(words->nwords < 1);
Assert(words->nwords == 0);
return false;
}

Expand Down
24 changes: 18 additions & 6 deletions src/backend/access/bitmap/bitmaputil.c
Original file line number Diff line number Diff line change
Expand Up @@ -260,9 +260,23 @@ _bitmap_findnexttids(BMBatchWords *words, BMIterateResult *result,

result->nextTidLoc = result->numOfTids = 0;

_bitmap_catchup_to_next_tid(words, result);

Assert(words->firstTid == result->nextTid);
/*
 * Adjust result->lastScanWordNo only in the case where concurrent reads and
 * writes touch two adjacent bitmap index pages: an insert of a tid into the
 * full page (PAGE_FULL) can expand its compressed words into new words and
 * spill the rearranged words onto the next page (PAGE_NEXT). When we are
 * about to read that new page, we must catch lastScanWordNo up to the
 * current position.
 *
 * words->startNo is always 0 here; the value is only meaningfully used by
 * _bitmap_union, which stores the union of a set of bitmaps in 'words'.
 * result->lastScanWordNo is the index into words->cwords of the next word
 * BMIterateResult will read; it starts at 0 and increments as the scan
 * proceeds. So result->lastScanWordNo == words->startNo means we are about
 * to scan a new bitmap index page.
 */
if (result->lastScanWordNo == words->startNo &&
words->firstTid < result->nextTid)
_bitmap_catchup_to_next_tid(words, result);

while (words->nwords > 0 && result->numOfTids < maxTids && !done)
{
Expand Down Expand Up @@ -358,10 +372,8 @@ _bitmap_findnexttids(BMBatchWords *words, BMIterateResult *result,

/*
* _bitmap_catchup_to_next_tid - Catch up to the nextTid we need to check
* from last iteration.
* from last iteration, in the following cases:
*
* Normally words->firstTid should equal to result->nextTid. But there
* are exceptions:
* 1: When the concurrent insert causes bitmap items from previous full page
* to spill over to current page in the window when we (the read transaction)
* had released the lock on the previous page and not locked the current page.
Expand Down
54 changes: 54 additions & 0 deletions src/backend/access/common/reloptions_gp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1968,3 +1968,57 @@ ao_amoptions(Datum reloptions, char relkind, bool validate)
return NULL;
}
}

/*
* GPDB: Convenience function to judge a relation option whether already in opts
*/
bool
reloptions_has_opt(List *opts, const char *name)
{
ListCell *lc;
foreach(lc, opts)
{
DefElem *de = lfirst(lc);
if (pg_strcasecmp(de->defname, name) == 0)
return true;
}
return false;
}

/*
* GPDB: Convenience function to build storage reloptions for a given relation, just for AO table.
*/
List *
build_ao_rel_storage_opts(List *opts, Relation rel)
{
bool checksum = true;
int32 blocksize = -1;
int16 compresslevel = 0;
char *compresstype = NULL;
NameData compresstype_nd;

GetAppendOnlyEntryAttributes(RelationGetRelid(rel),
&blocksize,
NULL,
&compresslevel,
&checksum,
&compresstype_nd);
compresstype = NameStr(compresstype_nd);

if (!reloptions_has_opt(opts, "blocksize"))
opts = lappend(opts, makeDefElem("blocksize", (Node *) makeInteger(blocksize), -1));

if (!reloptions_has_opt(opts, "compresslevel"))
opts = lappend(opts, makeDefElem("compresslevel", (Node *) makeInteger(compresslevel), -1));

if (!reloptions_has_opt(opts, "checksum"))
opts = lappend(opts, makeDefElem("checksum", (Node *) makeInteger(checksum), -1));

if (!reloptions_has_opt(opts, "compresstype"))
{
compresstype = (compresstype && compresstype[0]) ? pstrdup(compresstype) : "none";
opts = lappend(opts, makeDefElem("compresstype", (Node *) makeString(compresstype), -1));
}

return opts;
}
26 changes: 16 additions & 10 deletions src/backend/access/heap/heaptoast.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@

static void *
heap_toast_insert_or_update_generic(Relation rel, void *newtup, void *oldtup,
MemTupleBinding *pbind, int options, bool ismemtuple);
MemTupleBinding *pbind, int options, int toast_tuple_target, bool ismemtuple);

/* ----------
* heap_toast_delete -
Expand Down Expand Up @@ -100,18 +100,18 @@ heap_toast_delete(Relation rel, HeapTuple oldtup, bool is_speculative)
HeapTuple
heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options)
{
return (HeapTuple) heap_toast_insert_or_update_generic(rel, newtup, oldtup, NULL, options, false);
return (HeapTuple) heap_toast_insert_or_update_generic(rel, newtup, oldtup, NULL, options, TOAST_TUPLE_TARGET, false);
}

MemTuple memtup_toast_insert_or_update(Relation rel, MemTuple newtup, MemTuple oldtup,
MemTupleBinding *pbind, int options)
MemTupleBinding *pbind, int toast_tuple_target, int options)
{
return (MemTuple) heap_toast_insert_or_update_generic(rel, newtup, oldtup, pbind, options, true);
return (MemTuple) heap_toast_insert_or_update_generic(rel, newtup, oldtup, pbind, toast_tuple_target, options, true);
}

static void *
heap_toast_insert_or_update_generic(Relation rel, void *newtup, void *oldtup,
MemTupleBinding *pbind, int options, bool ismemtuple)
MemTupleBinding *pbind, int options, int toast_tuple_target, bool ismemtuple)
{
void *result_gtuple;
TupleDesc tupleDesc;
Expand Down Expand Up @@ -210,10 +210,8 @@ heap_toast_insert_or_update_generic(Relation rel, void *newtup, void *oldtup,
}
else
{
/* Since reloptions for AO table is not permitted, so using TOAST_TUPLE_TARGET */
hoff = sizeof(MemTupleData);
hoff = MAXALIGN(hoff);
maxDataLen = TOAST_TUPLE_TARGET - hoff;
maxDataLen = toast_tuple_target;
hoff = -1; /* keep compiler quiet about using 'hoff' uninitialized */
}

/*
Expand Down Expand Up @@ -295,7 +293,15 @@ heap_toast_insert_or_update_generic(Relation rel, void *newtup, void *oldtup,
* increase the target tuple size, so that MAIN attributes aren't stored
* externally unless really necessary.
*/
maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff;
/*
* FIXME: Should we do something like this with memtuples on
* AO tables too? Currently we do not increase the target tuple size for AO
* table, so there are occasions when columns of type 'm' will be stored
* out-of-line but they could otherwise be accommodated in-block
* c.f. upstream Postgres commit ca7c8168de76459380577eda56a3ed09b4f6195c
*/
if (!ismemtuple)
maxDataLen = TOAST_TUPLE_TARGET_MAIN - hoff;

while (heap_compute_data_size(tupleDesc,
toast_values, toast_isnull) > maxDataLen &&
Expand Down
14 changes: 8 additions & 6 deletions src/backend/catalog/namespace.c
Original file line number Diff line number Diff line change
Expand Up @@ -3487,17 +3487,19 @@ SetTempNamespaceState(Oid tempNamespaceId, Oid tempToastNamespaceId)
/*
* like SetTempNamespaceState, but the process running normally
*
* GPDB: used to set session level temporary namespace after gang launched.
* GPDB: used to set session level temporary namespace after reader gang launched.
*/
void
SetTempNamespaceStateAfterBoot(Oid tempNamespaceId, Oid tempToastNamespaceId)
{
/* same as PG, can not switch to other temp namespace dynamically */
Assert(myTempNamespace == InvalidOid || myTempNamespace == tempNamespaceId);
Assert(myTempToastNamespace == InvalidOid || myTempToastNamespace == tempToastNamespaceId);
Assert(Gp_role == GP_ROLE_EXECUTE);

/* if the namespace OID already setted, baseSearchPath is still valid */
if (myTempNamespace == tempToastNamespaceId && myTempToastNamespace == tempToastNamespaceId)
/* writer gang will do InitTempTableNamespace(), ignore the dispatch on writer gang */
if (Gp_is_writer)
return;

/* skip rebuild search path if search path is correct and valid */
if (tempNamespaceId == myTempNamespace && myTempToastNamespace == tempToastNamespaceId)
return;

myTempNamespace = tempNamespaceId;
Expand Down
2 changes: 1 addition & 1 deletion src/backend/cdb/dispatcher/cdbdisp_query.c
Original file line number Diff line number Diff line change
Expand Up @@ -1021,7 +1021,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms,
pos += resgroupInfo.len;
}

/* in-process variable for temporary namespace */
/* pass process local variables to QEs */
GetTempNamespaceState(&tempNamespaceId, &tempToastNamespaceId);
tempNamespaceId = htonl(tempNamespaceId);
tempToastNamespaceId = htonl(tempToastNamespaceId);
Expand Down
15 changes: 14 additions & 1 deletion src/backend/commands/tablecmds.c
Original file line number Diff line number Diff line change
Expand Up @@ -17888,7 +17888,7 @@ build_ctas_with_dist(Relation rel, DistributedBy *dist_clause,

pre_built = prebuild_temp_table(rel, tmprel, dist_clause,
get_am_name(rel->rd_rel->relam),
storage_opts,
RelationIsAppendOptimized(rel) ? build_ao_rel_storage_opts(NIL, rel) : storage_opts,
RelationIsAppendOptimized(rel),
useExistingColumnAttributes);
if (pre_built)
Expand All @@ -17912,6 +17912,19 @@ build_ctas_with_dist(Relation rel, DistributedBy *dist_clause,
into->options = storage_opts;
into->tableSpaceName = get_tablespace_name(tblspc);
into->distributedBy = (Node *)dist_clause;
if (RelationIsAppendOptimized(rel))
{
/*
 * Re-build the storage options from the original table so that the result
 * is not affected by the gp_default_storage_options GUC.
 *
 * When a table is created with default storage parameters, those settings
 * are not written to pg_class.reloptions.  If gp_default_storage_options is
 * later modified, a table newly created from such options would then be
 * inconsistent with the original table.
 */
into->options = build_ao_rel_storage_opts(into->options, rel);
}
s->intoClause = into;

RawStmt *rawstmt = makeNode(RawStmt);
Expand Down
2 changes: 1 addition & 1 deletion src/backend/tcop/postgres.c
Original file line number Diff line number Diff line change
Expand Up @@ -5722,7 +5722,7 @@ PostgresMain(int argc, char *argv[],
if (resgroupInfoLen > 0)
resgroupInfoBuf = pq_getmsgbytes(&input_message, resgroupInfoLen);

/* in-process variable for temporary namespace */
/* process local variables for temporary namespace */
{
Oid tempNamespaceId, tempToastNamespaceId;

Expand Down
2 changes: 1 addition & 1 deletion src/include/access/heaptoast.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@
*/
extern HeapTuple heap_toast_insert_or_update(Relation rel, HeapTuple newtup, HeapTuple oldtup, int options);
extern MemTuple memtup_toast_insert_or_update(Relation rel, MemTuple newtup, MemTuple oldtup,
MemTupleBinding *pbind, int options);
MemTupleBinding *pbind, int toast_tuple_target, int options);

/* ----------
* heap_toast_delete -
Expand Down
2 changes: 2 additions & 0 deletions src/include/access/reloptions.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ List* transfromColumnEncodingAocoRootPartition(List *colDefs, List *stenc, List
extern List *transformStorageEncodingClause(List *options, bool validate);
extern List *form_default_storage_directive(List *enc);
extern bool is_storage_encoding_directive(const char *name);
extern bool reloptions_has_opt(List *opts, const char *name);
extern List *build_ao_rel_storage_opts(List *opts, Relation rel);

extern relopt_value *
parseRelOptions(Datum options, bool validate, relopt_kind kind, int *numrelopts);
Expand Down
28 changes: 28 additions & 0 deletions src/test/regress/expected/bfv_temp.out
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,31 @@ drop table tn_b_b;
drop table tn_b_temp;
drop table tn_b_new;
drop function fun(sql text, a oid);
-- Chek if error out inside UDF, myTempNamespace will roll back
\c
create or replace function errored_udf() returns int[] as 'BEGIN RAISE EXCEPTION ''AAA''; END' language plpgsql;
create table n as select from generate_series(1, 10);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named '' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
select count(*) from n n1, n n2; -- boot reader gang
count
-------
100
(1 row)

create temp table nn as select errored_udf() from n;
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'errored_udf' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
ERROR: AAA (seg0 slice1 127.0.0.1:7002 pid=123850)
CONTEXT: PL/pgSQL function errored_udf() line 1 at RAISE
create temp table nnn as select * from generate_series(1, 10); -- check if reader do the rollback. should OK
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column(s) named 'generate_series' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
select count(*) from nnn n1, nnn n2; -- check if reader can read temp table. should OK
count
-------
100
(1 row)

drop table n;
drop function errored_udf();
20 changes: 20 additions & 0 deletions src/test/regress/expected/bitmap_index.out
Original file line number Diff line number Diff line change
Expand Up @@ -1088,3 +1088,23 @@ select * from bm_test where b = 1;

-- clean up
drop table bm_test;
-- test the scenario that we need read the same batch words many times
-- more detials can be found at https://github.com/greenplum-db/gpdb/issues/13446
SET enable_seqscan = OFF;
SET enable_bitmapscan = OFF;
create table foo_13446(a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create index idx_13446 on foo_13446 using bitmap(b);
insert into foo_13446 select 1, 1 from generate_series(0, 16384);
-- At current implementation, BMIterateResult can only store 16*1024=16384 TIDs,
-- if we have 13685 TIDs to read, it must scan same batch words twice, that's what we want
select count(*) from foo_13446 where b = 1;
count
-------
16385
(1 row)

drop table foo_13446;
SET enable_seqscan = ON;
SET enable_bitmapscan = ON;
20 changes: 20 additions & 0 deletions src/test/regress/expected/bitmap_index_optimizer.out
Original file line number Diff line number Diff line change
Expand Up @@ -1099,3 +1099,23 @@ select * from bm_test where b = 1;

-- clean up
drop table bm_test;
-- test the scenario that we need read the same batch words many times
-- more detials can be found at https://github.com/greenplum-db/gpdb/issues/13446
SET enable_seqscan = OFF;
SET enable_bitmapscan = OFF;
create table foo_13446(a int, b int);
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Greenplum Database data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
create index idx_13446 on foo_13446 using bitmap(b);
insert into foo_13446 select 1, 1 from generate_series(0, 16384);
-- At current implementation, BMIterateResult can only store 16*1024=16384 TIDs,
-- if we have 13685 TIDs to read, it must scan same batch words twice, that's what we want
select count(*) from foo_13446 where b = 1;
count
-------
16385
(1 row)

drop table foo_13446;
SET enable_seqscan = ON;
SET enable_bitmapscan = ON;
Loading
Loading