From febac7b125c1ec79a3cd4de59b5f1eeb83f549b5 Mon Sep 17 00:00:00 2001 From: baishen Date: Fri, 7 Jun 2024 22:15:32 +0800 Subject: [PATCH] feat(query): add inverted index size to block meta (#15752) * feat(query): add inverted index size to block meta * fix --- src/query/service/src/test_kits/block_writer.rs | 1 + .../tests/it/storages/fuse/bloom_index_meta_size.rs | 4 +--- .../fuse/operations/mutation/recluster_mutator.rs | 1 + .../operations/mutation/segments_compact_mutator.rs | 1 + .../tests/it/storages/fuse/operations/read_plan.rs | 1 + src/query/service/tests/it/storages/fuse/statistics.rs | 1 + .../storages/common/table_meta/src/meta/v2/segment.rs | 5 +++++ .../common/table_meta/src/meta/v3/frozen/block_meta.rs | 1 + src/query/storages/fuse/src/io/write/block_writer.rs | 7 +++++++ src/query/storages/fuse/src/statistics/reducers.rs | 1 + .../fuse/src/table_functions/fuse_blocks/fuse_block.rs | 10 ++++++++++ .../04_0000_inverted_index_base.test | 5 +++++ 12 files changed, 35 insertions(+), 3 deletions(-) diff --git a/src/query/service/src/test_kits/block_writer.rs b/src/query/service/src/test_kits/block_writer.rs index 2654c246f5c2..8fff99b506f2 100644 --- a/src/query/service/src/test_kits/block_writer.rs +++ b/src/query/service/src/test_kits/block_writer.rs @@ -90,6 +90,7 @@ impl<'a> BlockWriter<'a> { location, bloom_filter_index_location, bloom_filter_index_size, + None, Compression::Lz4Raw, Some(Utc::now()), ); diff --git a/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs b/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs index bdb737adff2e..8b1eaff52723 100644 --- a/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs +++ b/src/query/service/tests/it/storages/fuse/bloom_index_meta_size.rs @@ -17,8 +17,6 @@ use std::collections::HashSet; use std::sync::Arc; use chrono::Utc; -// use databend_common_arrow::parquet::metadata::FileMetaData; -// use databend_common_arrow::parquet::metadata::ThriftFileMetaData; use databend_common_base::base::tokio; use databend_common_cache::Cache; use databend_common_expression::types::Int32Type; @@ -38,7 +36,6 @@ use databend_common_storages_fuse::FuseStorageFormat; use databend_query::test_kits::*; use databend_storages_common_cache::InMemoryCacheBuilder; use databend_storages_common_cache::InMemoryItemCacheHolder; -// use databend_storages_common_index::BloomIndexMeta; use databend_storages_common_table_meta::meta::BlockMeta; use databend_storages_common_table_meta::meta::ColumnMeta; use databend_storages_common_table_meta::meta::ColumnStatistics; @@ -340,6 +337,7 @@ fn build_test_segment_info( location: block_location, bloom_filter_index_location: Some(location_gen.block_bloom_index_location(&block_uuid)), bloom_filter_index_size: 0, + inverted_index_size: None, compression: Compression::Lz4, create_on: Some(Utc::now()), }; diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs index a2e6c9dd659e..eee18e656d2f 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/recluster_mutator.rs @@ -75,6 +75,7 @@ async fn test_recluster_mutator_block_select() -> Result<()> { location.clone(), None, 0, + None, meta::Compression::Lz4Raw, Some(Utc::now()), )); diff --git a/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs b/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs index 2f8ee2261e12..d910915cc9ce 100644 --- a/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs +++ b/src/query/service/tests/it/storages/fuse/operations/mutation/segments_compact_mutator.rs @@ -774,6 +774,7 @@ impl CompactSegmentTestFixture { location, None, 0, + None, Compression::Lz4Raw, Some(Utc::now()), ); diff --git a/src/query/service/tests/it/storages/fuse/operations/read_plan.rs b/src/query/service/tests/it/storages/fuse/operations/read_plan.rs index 53f75b3801db..ae769f50dc25 100644 --- a/src/query/service/tests/it/storages/fuse/operations/read_plan.rs +++ b/src/query/service/tests/it/storages/fuse/operations/read_plan.rs @@ -99,6 +99,7 @@ fn test_to_partitions() -> Result<()> { location, bloom_filter_location, bloom_filter_size, + None, meta::Compression::Lz4Raw, Some(Utc::now()), )); diff --git a/src/query/service/tests/it/storages/fuse/statistics.rs b/src/query/service/tests/it/storages/fuse/statistics.rs index bea0c9224561..246b2799658d 100644 --- a/src/query/service/tests/it/storages/fuse/statistics.rs +++ b/src/query/service/tests/it/storages/fuse/statistics.rs @@ -624,6 +624,7 @@ fn test_reduce_block_meta() -> databend_common_exception::Result<()> { location.clone(), None, bloom_filter_index_size, + None, Compression::Lz4Raw, Some(Utc::now()), ); diff --git a/src/query/storages/common/table_meta/src/meta/v2/segment.rs b/src/query/storages/common/table_meta/src/meta/v2/segment.rs index 532fbfb46a08..b050a5377cac 100644 --- a/src/query/storages/common/table_meta/src/meta/v2/segment.rs +++ b/src/query/storages/common/table_meta/src/meta/v2/segment.rs @@ -77,6 +77,7 @@ pub struct BlockMeta { #[serde(default)] pub bloom_filter_index_size: u64, + pub inverted_index_size: Option, pub compression: Compression, // block create_on @@ -95,6 +96,7 @@ impl BlockMeta { location: Location, bloom_filter_index_location: Option, bloom_filter_index_size: u64, + inverted_index_size: Option, compression: Compression, create_on: Option>, ) -> Self { @@ -108,6 +110,7 @@ impl BlockMeta { location, bloom_filter_index_location, bloom_filter_index_size, + inverted_index_size, compression, create_on, } @@ -255,6 +258,7 @@ impl BlockMeta { bloom_filter_index_location: None, bloom_filter_index_size: 0, compression: Compression::Lz4, + inverted_index_size: None, create_on: None, } } @@ -287,6 +291,7 @@ impl BlockMeta { bloom_filter_index_location: s.bloom_filter_index_location.clone(), bloom_filter_index_size: s.bloom_filter_index_size, compression: s.compression, + inverted_index_size: None, create_on: None, } } diff --git a/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs b/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs index f778fbaebb5d..aebed5d56ba4 100644 --- a/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs +++ b/src/query/storages/common/table_meta/src/meta/v3/frozen/block_meta.rs @@ -61,6 +61,7 @@ impl From for crate::meta::BlockMeta { location: value.location, bloom_filter_index_location: value.bloom_filter_index_location, bloom_filter_index_size: value.bloom_filter_index_size, + inverted_index_size: None, compression: value.compression.into(), create_on: None, } diff --git a/src/query/storages/fuse/src/io/write/block_writer.rs b/src/query/storages/fuse/src/io/write/block_writer.rs index 64a1733fd956..6fe9967f1b1c 100644 --- a/src/query/storages/fuse/src/io/write/block_writer.rs +++ b/src/query/storages/fuse/src/io/write/block_writer.rs @@ -381,6 +381,12 @@ impl BlockBuilder { &mut buffer, )?; let file_size = buffer.len() as u64; + let inverted_index_size = if !inverted_index_states.is_empty() { + let size = inverted_index_states.iter().map(|v| v.size).sum(); + Some(size) + } else { + None + }; let block_meta = BlockMeta { row_count, block_size, @@ -395,6 +401,7 @@ impl BlockBuilder { .map(|v| v.size) .unwrap_or_default(), compression: self.write_settings.table_compression.into(), + inverted_index_size, create_on: Some(Utc::now()), }; diff --git a/src/query/storages/fuse/src/statistics/reducers.rs b/src/query/storages/fuse/src/statistics/reducers.rs index f6db36560e30..3229497bfc69 100644 --- a/src/query/storages/fuse/src/statistics/reducers.rs +++ b/src/query/storages/fuse/src/statistics/reducers.rs @@ -211,6 +211,7 @@ pub fn reduce_block_metas>( uncompressed_byte_size += b.block_size; compressed_byte_size += b.file_size; index_size += b.bloom_filter_index_size; + index_size += b.inverted_index_size.unwrap_or_default(); if thresholds.check_large_enough(b.row_count as usize, b.block_size as usize) || b.cluster_stats.as_ref().is_some_and(|v| v.level != 0) { diff --git a/src/query/storages/fuse/src/table_functions/fuse_blocks/fuse_block.rs b/src/query/storages/fuse/src/table_functions/fuse_blocks/fuse_block.rs index 65653201fa7c..c0658d350e35 100644 --- a/src/query/storages/fuse/src/table_functions/fuse_blocks/fuse_block.rs +++ b/src/query/storages/fuse/src/table_functions/fuse_blocks/fuse_block.rs @@ -111,6 +111,7 @@ impl<'a> FuseBlock<'a> { let mut row_count = Vec::with_capacity(len); let mut bloom_filter_location = vec![]; let mut bloom_filter_size = Vec::with_capacity(len); + let mut inverted_index_size = Vec::with_capacity(len); let segments_io = SegmentsIO::create( self.ctx.clone(), @@ -143,6 +144,7 @@ impl<'a> FuseBlock<'a> { .map(|s| s.0.clone()), ); bloom_filter_size.push(block.bloom_filter_index_size); + inverted_index_size.push(block.inverted_index_size); row_num += 1; if row_num >= limit { @@ -188,6 +190,10 @@ impl<'a> FuseBlock<'a> { DataType::Number(NumberDataType::UInt64), Value::Column(UInt64Type::from_data(bloom_filter_size)), ), + BlockEntry::new( + DataType::Nullable(Box::new(DataType::Number(NumberDataType::UInt64))), + Value::Column(UInt64Type::from_opt_data(inverted_index_size)), + ), ], row_num, )) @@ -209,6 +215,10 @@ impl<'a> FuseBlock<'a> { "bloom_filter_size", TableDataType::Number(NumberDataType::UInt64), ), + TableField::new( + "inverted_index_size", + TableDataType::Nullable(Box::new(TableDataType::Number(NumberDataType::UInt64))), + ), ]) } } diff --git a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test index 05a3b1e10a33..06637da50e60 100644 --- a/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test +++ b/tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test @@ -429,6 +429,11 @@ idx INVERTED t1(body)tokenizer='chinese' idx1 INVERTED t(content)index_record='"basic"' tokenizer='chinese' idx2 INVERTED books(title, author, description)index_record='"basic"' tokenizer='chinese' +query III +select row_count, bloom_filter_size, inverted_index_size from fuse_block('test_index', 't1') +---- +10 465 3534 + statement ok use default