Fix UBSan failure in bulk text decompression #6583

Merged 1 commit on Feb 1, 2024
13 changes: 6 additions & 7 deletions .github/workflows/libfuzzer.yaml
@@ -49,7 +49,7 @@ jobs:
# leading to a tainted cache
- name: Cache PostgreSQL
id: cache-postgresql
uses: actions/cache@v3
uses: actions/cache@v4
with:
path: ~/${{ env.PG_SRC_DIR }}
key: "postgresql-libfuzzer-${{ steps.get-date.outputs.date }}-${{ hashFiles('.github/**') }}"
@@ -145,7 +145,7 @@ jobs:
uses: actions/checkout@v4

- name: Download the installation directory
uses: actions/download-artifact@v3
uses: actions/download-artifact@v4
with:
name: fuzzing-install-dir

@@ -240,9 +240,6 @@ jobs:
exit 1
fi

# Check that the server is still alive.
psql -c "select 1"

ls db/corpus | wc -l

fn="ts_read_compressed_data_directory('${{ matrix.case.algo }}',
@@ -257,7 +254,7 @@
# Save interesting cases because the caches are not available for download from UI
mkdir -p interesting
psql -qtAX -c "select distinct on (location) 'db/' || path from $fn
order by location, bytes
order by location, bytes, path
" | xargs cp -t interesting

# Check that we don't have any internal errors
@@ -267,10 +264,12 @@
echo "Internal program errors: $errors"
[ $errors -eq 0 ] || exit 1


# Shouldn't have any WARNINGS in the log.
! grep -F "] WARNING: " postgres.log

# Check that the server is still alive.
psql -c "select 1"

- name: Collect the logs
if: always()
id: collectlogs
11 changes: 9 additions & 2 deletions .github/workflows/sanitizer-build-and-test.yaml
@@ -204,19 +204,26 @@ jobs:
./scripts/bundle_coredumps.sh
grep -C40 "was terminated by signal" postgres.log > postgres-failure.log ||:

- name: Show sanitizer logs
if: always()
run: |
tail -vn +1 sanitizer* || :

- name: Coredumps
if: always() && steps.collectlogs.outputs.coredumps == 'true'
uses: actions/upload-artifact@v4
with:
name: Coredumps ${{ matrix.os }} ${{ env.name }} ${{ matrix.pg }}
path: coredumps

- name: sanitizer logs
- name: Upload sanitizer logs
if: always()
uses: actions/upload-artifact@v4
with:
name: sanitizer logs ${{ matrix.os }} ${{ env.name }} ${{ matrix.pg }}
path: ${{ github.workspace }}/sanitizer
# The log_path sanitizer option means "Write logs to 'log_path.pid'".
# https://github.com/google/sanitizers/wiki/SanitizerCommonFlags
path: ${{ github.workspace }}/sanitizer*

- name: Upload test results to the database
if: always()
3 changes: 2 additions & 1 deletion scripts/upload_ci_stats.sh
@@ -150,7 +150,8 @@ do
done

# Upload the logs.
for x in sanitizer/* {sqlsmith/sqlsmith,sanitizer,stacktrace,postgres-failure}.log *.diff
# Note that the sanitizer setting log_path means "write logs to 'log_path.pid'".
for x in sanitizer* sanitizer/* {sqlsmith/sqlsmith,sanitizer,stacktrace,postgres-failure}.log *.diff
do
if ! [ -e "$x" ]; then continue ; fi
"${PSQL[@]}" <<<"
64 changes: 52 additions & 12 deletions tsl/src/compression/array.c
@@ -493,12 +493,18 @@ text_array_decompress_all_serialized_no_header(StringInfo si, bool has_nulls,

Simple8bRleSerialized *sizes_serialized = bytes_deserialize_simple8b_and_advance(si);

uint32 sizes[GLOBAL_MAX_ROWS_PER_COMPRESSION];
/*
* We need a quite significant padding of 63 elements, not bytes, after the
* last element, because we work in Simple8B blocks which can contain up to
* 64 elements.
*/
uint32 sizes[GLOBAL_MAX_ROWS_PER_COMPRESSION + 63];
const uint16 n_notnull =
simple8brle_decompress_all_buf_uint32(sizes_serialized,
sizes,
sizeof(sizes) / sizeof(sizes[0]));
const int n_total = has_nulls ? nulls_serialized->num_elements : n_notnull;
CheckCompressedData(n_total >= n_notnull);

uint32 *offsets =
(uint32 *) MemoryContextAllocZero(dest_mctx,
@@ -509,22 +515,56 @@
uint32 offset = 0;
for (int i = 0; i < n_notnull; i++)
{
void *vardata = consumeCompressedData(si, sizes[i]);
void *unaligned = consumeCompressedData(si, sizes[i]);

/*
* We start reading from the end of the previous datum, but this pointer
* might not be aligned as required for the varlena-4b struct, so we have
* to align it here. Note that sizes[i] includes the alignment in addition
* to the varlena size.
*
* See the corresponding row-by-row code in bytes_to_datum_and_advance().
*/
void *vardata = DatumGetPointer(att_align_pointer(unaligned, TYPALIGN_INT, -1, unaligned));

/*
* Check for potentially corrupt varlena headers since we're reading them
* directly from compressed data. We can only have a plain datum
* with 1-byte or 4-byte header here, no TOAST or compressed data.
* directly from compressed data.
*/
CheckCompressedData(VARATT_IS_4B_U(vardata) ||
(VARATT_IS_1B(vardata) && !VARATT_IS_1B_E(vardata)));
if (VARATT_IS_4B_U(vardata))
{
/*
* Full varsize must be larger than or equal to the header size so that
* the calculation of the size without header doesn't overflow.
*/
CheckCompressedData(VARSIZE_4B(vardata) >= VARHDRSZ);
}
else if (VARATT_IS_1B(vardata))
{
/* Can't have a TOAST pointer here. */
CheckCompressedData(!VARATT_IS_1B_E(vardata));

/*
* Full varsize must be larger than or equal to the header size so that
* the calculation of the size without header doesn't overflow.
*/
CheckCompressedData(VARSIZE_1B(vardata) >= VARHDRSZ_SHORT);
}
else
{
/*
* Can only have an uncompressed datum with 1-byte or 4-byte header
* here, no TOAST or compressed data.
*/
CheckCompressedData(false);
}

/*
* Full varsize must be larger or equal than the header size so that the
* calculation of size without header doesn't overflow.
* Size of varlena plus alignment must match the size stored in the
* sizes array for this element.
*/
CheckCompressedData((VARATT_IS_1B(vardata) && VARSIZE_1B(vardata) >= VARHDRSZ_SHORT) ||
(VARSIZE_4B(vardata) >= VARHDRSZ));
/* Varsize must match the size stored in the sizes array for this element. */
CheckCompressedData(VARSIZE_ANY(vardata) == sizes[i]);
const Datum alignment_bytes = PointerGetDatum(vardata) - PointerGetDatum(unaligned);
CheckCompressedData(VARSIZE_ANY(vardata) + alignment_bytes == sizes[i]);

const uint32 textlen = VARSIZE_ANY_EXHDR(vardata);
memcpy(&arrow_bodies[offset], VARDATA_ANY(vardata), textlen);
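For illustration, here is a rough, self-contained model of the header validation added above, written in plain C instead of the PostgreSQL varlena macros (att_align_pointer, VARATT_IS_1B, VARATT_IS_1B_E, VARSIZE_4B and friends) and assuming a little-endian build. The function name validate_untrusted_varlena is hypothetical and the bit manipulation is a simplification, not the merged code:

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Simplified model: a datum read from untrusted compressed input must be a
 * plain varlena with either a 1-byte header (never a TOAST pointer) or an
 * uncompressed 4-byte header, and its size, including any bytes skipped for
 * alignment, must match the size recorded in the compressed stream.
 */
static bool
validate_untrusted_varlena(const uint8_t *unaligned, uint32_t stored_size)
{
	const uint8_t *p = unaligned;

	/*
	 * A zero first byte is a padding byte inserted by the writer, so skip to
	 * the next 4-byte boundary, roughly what att_align_pointer() does for
	 * varlena attributes.
	 */
	if (p[0] == 0x00)
		p = (const uint8_t *) (((uintptr_t) unaligned + 3) & ~(uintptr_t) 3);
	const uint32_t alignment_bytes = (uint32_t) (p - unaligned);

	uint32_t varsize;
	if ((p[0] & 0x01) == 0x01)
	{
		/* 1-byte header (little-endian); 0x01 alone marks a TOAST pointer. */
		if (p[0] == 0x01)
			return false;
		varsize = p[0] >> 1; /* total size, must cover the 1-byte header */
		if (varsize < 1)
			return false;
	}
	else if ((p[0] & 0x03) == 0x00)
	{
		/* Plain 4-byte header: the upper 30 bits hold the total size. */
		memcpy(&varsize, p, sizeof(varsize));
		varsize >>= 2;
		if (varsize < 4)
			return false;
	}
	else
	{
		/* Compressed or otherwise unexpected header: reject. */
		return false;
	}

	return varsize + alignment_bytes == stored_size;
}
```

On a big-endian build the header bit layout differs, which is one reason the real code goes through the varlena macros rather than raw bytes.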
2 changes: 1 addition & 1 deletion tsl/src/compression/compression.h
@@ -377,7 +377,7 @@ extern enum CompressionAlgorithms compress_get_default_algorithm(Oid typeoid);
*/
#ifndef TS_COMPRESSION_FUZZING
#define CORRUPT_DATA_MESSAGE(X) \
(errmsg("the compressed data is corrupt"), errdetail(X), errcode(ERRCODE_DATA_CORRUPTED))
(errmsg("the compressed data is corrupt"), errdetail("%s", X), errcode(ERRCODE_DATA_CORRUPTED))
#else
#define CORRUPT_DATA_MESSAGE(X) (errcode(ERRCODE_DATA_CORRUPTED))
#endif
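The functional change here is wrapping the detail text in a "%s" format: errdetail() takes a printf-style format string, so passing X straight through would let any '%' characters in the message be parsed as conversion specifiers (and can trip -Wformat-security when the argument is not a string literal). A minimal plain-C illustration of the same hazard, with printf() standing in for errdetail():

```c
#include <stdio.h>

int
main(void)
{
	/* Hypothetical detail text that happens to contain a '%' character. */
	const char *detail = "compressed block is 100% larger than declared";

	/*
	 * Unsafe: the message itself becomes the format string, so the '%'
	 * would be interpreted as the start of a conversion specifier.
	 *
	 * printf(detail);
	 */

	/* Safe: the message is passed as data, never as a format. */
	printf("%s\n", detail);
	return 0;
}
```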
1 change: 1 addition & 0 deletions tsl/src/compression/dictionary.c
@@ -426,6 +426,7 @@ tsl_text_dictionary_decompress_all(Datum compressed, Oid element_type, MemoryCon

const uint16 n_notnull = indices_serialized->num_elements;
const uint16 n_total = header->has_nulls ? nulls_serialized->num_elements : n_notnull;
CheckCompressedData(n_total >= n_notnull);
const uint16 n_padded =
n_total + 63; /* This is the padding requirement of simple8brle_decompress_all. */
int16 *restrict indices = MemoryContextAlloc(dest_mctx, sizeof(int16) * n_padded);
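The new CheckCompressedData() guards the relationship between two counts read from untrusted input: the output buffers are sized from n_total, while n_notnull entries are written into them. A small, self-contained sketch of the same idea in plain C (the function and names are hypothetical stand-ins for the dictionary decompressor):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>

/*
 * Reject inconsistent counts before allocating: with corrupt input,
 * n_notnull > n_total would let the fill loop run past the end of a
 * buffer that was sized from n_total.
 */
static bool
fill_validity_bitmap(uint16_t n_total, uint16_t n_notnull, uint64_t **bitmap_out)
{
	if (n_total < n_notnull)
		return false; /* corrupt compressed data */

	size_t n_words = (n_total + 63u) / 64;
	uint64_t *bitmap = calloc(n_words > 0 ? n_words : 1, sizeof(uint64_t));
	if (bitmap == NULL)
		return false;

	/* Safe: n_notnull <= n_total, so every bit index fits the allocation. */
	for (uint16_t i = 0; i < n_notnull; i++)
		bitmap[i / 64] |= UINT64_C(1) << (i % 64);

	*bitmap_out = bitmap;
	return true;
}
```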
4 changes: 2 additions & 2 deletions tsl/src/compression/gorilla_impl.c
@@ -51,11 +51,11 @@ FUNCTION_NAME(gorilla_decompress_all, ELEMENT_TYPE)(CompressedGorillaData *goril
const uint16 leading_zeros_padded =
unpack_leading_zeros_array(&gorilla_data->leading_zeros, all_leading_zeros);

uint8 bit_widths[MAX_NUM_LEADING_ZEROS_PADDED_N64];
uint8 bit_widths[GLOBAL_MAX_ROWS_PER_COMPRESSION + 63];
const uint16 num_bit_widths =
simple8brle_decompress_all_buf_uint8(gorilla_data->num_bits_used_per_xor,
bit_widths,
MAX_NUM_LEADING_ZEROS_PADDED_N64);
sizeof(bit_widths) / sizeof(bit_widths[0]));

BitArray xors_bitarray = gorilla_data->xors;
BitArrayIterator xors_iterator;
5 changes: 5 additions & 0 deletions tsl/src/compression/simple8b_rle_decompress_all.h
@@ -21,6 +21,11 @@ FUNCTION_NAME(simple8brle_decompress_all_buf,
{
const uint16 n_total_values = compressed->num_elements;

/*
* Caller must have allocated a properly sized buffer, see the comment above.
*/
Assert(n_buffer_elements >= n_total_values + 63);

const uint16 num_selector_slots =
simple8brle_num_selector_slots_for_num_blocks(compressed->num_blocks);
const uint16 num_blocks = compressed->num_blocks;
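The new assertion spells out the buffer contract that the array.c and gorilla_impl.c changes above rely on (which is why their stack buffers were resized to GLOBAL_MAX_ROWS_PER_COMPRESSION + 63 elements): Simple8B decodes whole blocks of up to 64 elements, so the decoder may write up to 63 elements past num_elements. A toy decoder loop, not the real Simple8B-RLE format, showing where the overshoot comes from:

```c
#include <stdint.h>

#define BLOCK_LEN 64

/*
 * Toy decoder: every block is unpacked in full, so the write position can
 * end up to BLOCK_LEN - 1 elements past num_elements. Assuming
 * num_blocks == (num_elements + 63) / 64, the caller must provide a buffer
 * of at least num_elements + 63 slots, even though only the first
 * num_elements values are meaningful.
 */
static uint16_t
decode_blocks(const uint64_t *blocks, uint16_t num_blocks, uint16_t num_elements,
			  uint32_t *out /* num_elements + (BLOCK_LEN - 1) slots */)
{
	uint32_t decoded = 0;
	for (uint16_t b = 0; b < num_blocks; b++)
	{
		for (int i = 0; i < BLOCK_LEN; i++)
			out[decoded + i] = (uint32_t) ((blocks[b] >> i) & 1); /* pretend 1-bit packing */
		decoded += BLOCK_LEN; /* may overshoot num_elements by up to 63 */
	}
	return num_elements;
}
```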
16 changes: 8 additions & 8 deletions tsl/test/expected/compression_algos.out
@@ -1595,10 +1595,10 @@ group by 2, 3 order by 1 desc
;
count | bulk_result | rowbyrow_result
-------+-------------+-----------------
21 | XX001 | XX001
6 | 08P01 | 08P01
17 | XX001 | XX001
15 | true | true
7 | 08P01 | 08P01
2 | 3F000 | 3F000
2 | true | true
1 | 22021 | 22021
1 | false | false
(6 rows)
@@ -1613,11 +1613,11 @@
;
count | bulk_result | rowbyrow_result
-------+-------------+-----------------
51 | XX001 | XX001
4 | 08P01 | 08P01
4 | XX001 | true
2 | true | true
2 | 22021 | 22021
80 | XX001 | XX001
5 | 08P01 | 08P01
5 | XX001 | true
4 | true | true
3 | 22021 | 22021
1 | 3F000 | 3F000
1 | false | false
(7 rows)
68 changes: 68 additions & 0 deletions tsl/test/expected/decompress_vector_qual.out
@@ -1053,3 +1053,71 @@ select * from date_table where ts < CURRENT_DATE;
01-01-2021
(3 rows)

-- Text columns. Only tests bulk decompression for now.
create table text_table(ts int, d int);
select create_hypertable('text_table', 'ts');
NOTICE: adding not-null constraint to column "ts"
create_hypertable
-------------------------
(9,public,text_table,t)
(1 row)

alter table text_table set (timescaledb.compress, timescaledb.compress_segmentby = 'd');
insert into text_table select x, 0 /*, default */ from generate_series(1, 1000) x;
select count(compress_chunk(x, true)) from show_chunks('text_table') x;
count
-------
1
(1 row)

alter table text_table add column a text default 'default';
insert into text_table select x, 1, '' from generate_series(1, 1000) x;
insert into text_table select x, 2, 'same' from generate_series(1, 1000) x;
insert into text_table select x, 3, 'different' || x from generate_series(1, 1000) x;
insert into text_table select x, 4, case when x % 2 = 0 then null else 'same-with-nulls' end from generate_series(1, 1000) x;
insert into text_table select x, 5, case when x % 2 = 0 then null else 'different-with-nulls' || x end from generate_series(1, 1000) x;
insert into text_table select x, 6, 'одинаковый' from generate_series(1, 1000) x;
insert into text_table select x, 7, '異なる' || x from generate_series(1, 1000) x;
-- Some text values with varying lengths in a single batch. They are all different
-- to prevent dictionary compression, because we want to test particular orders
-- here as well.
insert into text_table select x, 8, repeat( x::text || 'a', x) from generate_series(1, 100) x;
insert into text_table select x + 100, 8, repeat((101 - x)::text || 'b', (101 - x)) from generate_series(1, 100) x;
insert into text_table select x + 200, 8, repeat((101 - x)::text || 'c', (101 - x)) from generate_series(1, 100) x;
insert into text_table select x + 300, 8, repeat( x::text || 'd', x) from generate_series(1, 100) x;
set timescaledb.debug_require_vector_qual to 'forbid';
select sum(length(a)) from text_table;
sum
--------
118551
(1 row)

select count(distinct a) from text_table;
count
-------
2905
(1 row)

select count(compress_chunk(x, true)) from show_chunks('text_table') x;
NOTICE: chunk "_hyper_9_17_chunk" is already compressed
count
-------
1
(1 row)

select format('call recompress_chunk(''%s'')', x) from show_chunks('text_table') x \gexec
call recompress_chunk('_timescaledb_internal._hyper_9_17_chunk')
set timescaledb.enable_bulk_decompression to on;
set timescaledb.debug_require_vector_qual to 'forbid';
select sum(length(a)) from text_table;
sum
--------
118551
(1 row)

select count(distinct a) from text_table;
count
-------
2905
(1 row)

Several new binary test files are not shown. Two new one-line text files were also added: one containing only an empty line and one containing "@".
37 changes: 37 additions & 0 deletions tsl/test/sql/decompress_vector_qual.sql
@@ -352,3 +352,40 @@ select * from date_table where ts <= '2021-01-02';
select * from date_table where ts < '2021-01-02';
select * from date_table where ts < CURRENT_DATE;

-- Text columns. Only tests bulk decompression for now.
create table text_table(ts int, d int);
select create_hypertable('text_table', 'ts');
alter table text_table set (timescaledb.compress, timescaledb.compress_segmentby = 'd');

insert into text_table select x, 0 /*, default */ from generate_series(1, 1000) x;
select count(compress_chunk(x, true)) from show_chunks('text_table') x;
alter table text_table add column a text default 'default';

insert into text_table select x, 1, '' from generate_series(1, 1000) x;
insert into text_table select x, 2, 'same' from generate_series(1, 1000) x;
insert into text_table select x, 3, 'different' || x from generate_series(1, 1000) x;
insert into text_table select x, 4, case when x % 2 = 0 then null else 'same-with-nulls' end from generate_series(1, 1000) x;
insert into text_table select x, 5, case when x % 2 = 0 then null else 'different-with-nulls' || x end from generate_series(1, 1000) x;
insert into text_table select x, 6, 'одинаковый' from generate_series(1, 1000) x;
insert into text_table select x, 7, '異なる' || x from generate_series(1, 1000) x;

-- Some text values with varying lengths in a single batch. They are all different
-- to prevent dictionary compression, because we want to test particular orders
-- here as well.
insert into text_table select x, 8, repeat( x::text || 'a', x) from generate_series(1, 100) x;
insert into text_table select x + 100, 8, repeat((101 - x)::text || 'b', (101 - x)) from generate_series(1, 100) x;
insert into text_table select x + 200, 8, repeat((101 - x)::text || 'c', (101 - x)) from generate_series(1, 100) x;
insert into text_table select x + 300, 8, repeat( x::text || 'd', x) from generate_series(1, 100) x;

set timescaledb.debug_require_vector_qual to 'forbid';
select sum(length(a)) from text_table;
select count(distinct a) from text_table;

select count(compress_chunk(x, true)) from show_chunks('text_table') x;
select format('call recompress_chunk(''%s'')', x) from show_chunks('text_table') x \gexec

set timescaledb.enable_bulk_decompression to on;
set timescaledb.debug_require_vector_qual to 'forbid';

select sum(length(a)) from text_table;
select count(distinct a) from text_table;
12 changes: 11 additions & 1 deletion tsl/test/src/decompress_arithmetic_test_impl.c
@@ -200,7 +200,17 @@
* 3) The bulk decompression must absolutely work on the correct compressed
* data we've just generated.
*/
arrow = decompress_all(compressed_data, PG_TYPE_OID, CurrentMemoryContext);
PG_TRY();
{
arrow = decompress_all(compressed_data, PG_TYPE_OID, CurrentMemoryContext);
}
PG_CATCH();

{
EmitErrorReport();

elog(PANIC, "bulk decompression failed for data that we've just compressed");
}
PG_END_TRY();

FUNCTION_NAME2(check_arrow, CTYPE)(arrow, PANIC, results, n);

return n;