Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(storage): support mutation during insertion #8205

Merged
merged 15 commits into from
Oct 19, 2022
7 changes: 5 additions & 2 deletions src/query/catalog/src/table_mutator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@
// See the License for the specific language governing permissions and
// limitations under the License.

use std::sync::Arc;

use common_exception::Result;
use common_meta_app::schema::TableInfo;

use crate::table::Table;

#[async_trait::async_trait]
pub trait TableMutator: Send + Sync {
async fn blocks_select(&mut self) -> Result<bool>;
async fn try_commit(&self, table_info: &TableInfo) -> Result<()>;
async fn try_commit(&self, table: Arc<dyn Table>) -> Result<()>;
}
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,14 @@ impl Interpreter for OptimizeTableInterpreter {
executor.execute()?;
drop(executor);

mutator.try_commit(table.get_table_info()).await?;
// "refresh" the table by using the catalog API directly
table = self
.ctx
.get_catalog(&plan.catalog)?
.get_table(ctx.get_tenant().as_str(), &plan.database, &plan.table)
.await?;

mutator.try_commit(table.clone()).await?;
}

if do_purge {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,13 @@ impl Interpreter for ReclusterTableInterpreter {
executor.execute()?;
drop(executor);

mutator.try_commit(table.get_table_info()).await?;
// refresh table
let table = self
.ctx
.get_catalog(&plan.catalog)?
.get_table(tenant.as_str(), &plan.database, &plan.table)
.await?;
mutator.try_commit(table).await?;

if !plan.is_final {
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,13 @@ async fn test_deletion_mutator_multiple_empty_segments() -> Result<()> {
}
}

let new_snapshot = mutator.into_new_snapshot().await?;
let (segments, _, _) = mutator.generate_segments().await?;

// half segments left after deletion
assert_eq!(new_snapshot.segments.len(), 50);
assert_eq!(segments.len(), 50);

// new_segments should be a subset of test_segments in our case (no partial deletion of segment)
let new_segments = HashSet::<_, RandomState>::from_iter(new_snapshot.segments.into_iter());
let new_segments = HashSet::<_, RandomState>::from_iter(segments.into_iter());
let test_segments = HashSet::from_iter(test_segment_locations.into_iter());
assert!(new_segments.is_subset(&test_segments));

Expand Down
58 changes: 58 additions & 0 deletions src/query/storages/fuse/src/operations/commit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ use crate::io::TableMetaLocationGenerator;
use crate::operations::AppendOperationLogEntry;
use crate::operations::TableOperationLog;
use crate::statistics;
use crate::statistics::merge_statistics;
use crate::FuseSegmentIO;
use crate::FuseTable;
use crate::OPT_KEY_LEGACY_SNAPSHOT_LOC;
use crate::OPT_KEY_SNAPSHOT_LOCATION;
Expand Down Expand Up @@ -381,6 +383,59 @@ impl FuseTable {
warn!("write last snapshot hint failure. {}", e);
})
}

/// Commits the outcome of a mutation that was computed against `base_snapshot`.
///
/// The latest snapshot of the table is read first:
///
/// * If it still has the id of `base_snapshot`, `segments` and `summary` are committed as given.
/// * Otherwise the commit only proceeds when the latest segment list is some newly inserted
///   segments followed by exactly the segments of `base_snapshot`. The inserted segments are
///   kept in front of `segments`, and their statistics are merged into `summary`.
///
/// # Errors
///
/// Returns `ErrorCode::UnknownException("Data mutates during operation")` when no snapshot can
/// be read, or when the latest snapshot is not "inserted segments + base segments". Errors from
/// reading segments, merging statistics, or committing to the meta server are propagated.
pub async fn commit_mutation(
    &self,
    ctx: Arc<dyn TableContext>,
    base_snapshot: Arc<TableSnapshot>,
    mut segments: Vec<Location>,
    mut summary: Statistics,
) -> Result<()> {
    // Single place that builds the conflict error; captures nothing, so it can be reused.
    let conflict = || ErrorCode::UnknownException("Data mutates during operation".to_string());

    let latest_snapshot = self
        .read_table_snapshot(ctx.clone())
        .await?
        .ok_or_else(conflict)?;

    if latest_snapshot.snapshot_id != base_snapshot.snapshot_id {
        // Fewer segments than the base means something other than insertion happened.
        let inserted_len = latest_snapshot
            .segments
            .len()
            .checked_sub(base_snapshot.segments.len())
            .ok_or_else(conflict)?;

        // Only insertions are tolerated: the tail of the latest segment list must be
        // exactly the base segment list.
        let (inserted, unchanged) = latest_snapshot.segments.split_at(inserted_len);
        if unchanged != base_snapshot.segments.as_slice() {
            return Err(conflict());
        }
        let mut new_segments = inserted.to_vec();

        // Read the inserted segments and fold their statistics into the summary.
        let fuse_segment_io = FuseSegmentIO::create(ctx.clone(), self.operator.clone());
        let results = fuse_segment_io.read_segments(&new_segments).await?;
        for result in results {
            let segment = result?;
            summary = merge_statistics(&summary, &segment.summary)?;
        }

        // Inserted segments stay in front of the segments produced by the mutation.
        new_segments.extend(segments);
        segments = new_segments;
    }

    let mut new_snapshot = TableSnapshot::from_previous(&latest_snapshot);
    new_snapshot.segments = segments;
    new_snapshot.summary = summary;

    Self::commit_to_meta_server(
        ctx.as_ref(),
        &self.table_info,
        &self.meta_location_generator,
        new_snapshot,
        &self.operator,
    )
    .await
}
}

mod utils {
Expand All @@ -398,6 +453,9 @@ mod utils {
// if deletion operation failed (after DAL retried)
// we just left them there, and let the "major GC" collect them
let _ = operator.object(block_location).delete().await;
if let Some(index) = &block.bloom_filter_index_location {
let _ = operator.object(&index.0).delete().await;
}
}
let _ = operator.object(&entry.segment_location).delete().await;
}
Expand Down
33 changes: 21 additions & 12 deletions src/query/storages/fuse/src/operations/delete.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

use std::sync::Arc;

use common_catalog::table::Table;
use common_catalog::table::TableExt;
use common_catalog::table_context::TableContext;
use common_datavalues::DataSchemaRefExt;
use common_exception::ErrorCode;
Expand Down Expand Up @@ -129,24 +129,33 @@ impl FuseTable {
}
}
}
self.commit_deletion(ctx.as_ref(), deletion_collector).await

self.commit_deletion(ctx, deletion_collector).await
}

async fn commit_deletion(
&self,
ctx: &dyn TableContext,
ctx: Arc<dyn TableContext>,
del_holder: DeletionMutator,
) -> Result<()> {
let new_snapshot = del_holder.into_new_snapshot().await?;
Self::commit_to_meta_server(
ctx,
self.get_table_info(),
&self.meta_location_generator,
new_snapshot,
&self.operator,
)
.await?;
let (segments, summary, abort_operation) = del_holder.generate_segments().await?;
// Refresh the table.
let latest = self.refresh(ctx.as_ref()).await?;
let table = FuseTable::try_from_table(latest.as_ref())?;

// TODO check if error is recoverable, and try to resolve the conflict
if let Err(e) = table
.commit_mutation(
ctx.clone(),
del_holder.base_snapshot().clone(),
segments,
summary,
)
.await
{
abort_operation.abort(self.operator.clone()).await;
return Err(e);
}
Ok(())
}

Expand Down
51 changes: 51 additions & 0 deletions src/query/storages/fuse/src/operations/mutation/abort_operation.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// Copyright 2022 Datafuse Labs.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

use common_fuse_meta::meta::BlockMeta;
use opendal::Operator;

/// Records the locations of objects that `abort` deletes from storage.
///
/// Entries are added through `add_block` and `add_segment`; each field is a plain list of
/// location strings.
#[derive(Default, Clone, Debug)]
pub struct AbortOperation {
/// Segment locations recorded by `add_segment`.
pub segments: Vec<String>,
/// Block locations recorded by `add_block`.
pub blocks: Vec<String>,
/// Bloom filter index locations recorded by `add_block` for blocks that have one.
pub bloom_filter_indexes: Vec<String>,
}

impl AbortOperation {
    /// Records the location of `block` and, if present, of its bloom filter index.
    ///
    /// Only the `.0` component of each location is stored; that is the value `abort`
    /// later hands to `Operator::object`. Returns the updated collector.
    pub fn add_block(mut self, block: &BlockMeta) -> Self {
        self.blocks.push(block.location.0.clone());
        if let Some(index) = &block.bloom_filter_index_location {
            self.bloom_filter_indexes.push(index.0.clone());
        }
        self
    }

    /// Records the location of a segment. Returns the updated collector.
    pub fn add_segment(mut self, segment: String) -> Self {
        self.segments.push(segment);
        self
    }

    /// Deletes every recorded object through `operator`.
    ///
    /// Objects are deleted one at a time in the order blocks, bloom filter indexes,
    /// segments. The result of each deletion is deliberately ignored, so a failed
    /// deletion does not stop the remaining ones.
    pub async fn abort(self, operator: Operator) {
        let locations = self
            .blocks
            .into_iter()
            .chain(self.bloom_filter_indexes)
            .chain(self.segments);
        for location in locations {
            let _ = operator.object(&location).delete().await;
        }
    }
}
22 changes: 7 additions & 15 deletions src/query/storages/fuse/src/operations/mutation/base_mutator.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ use common_fuse_meta::meta::Statistics;
use common_fuse_meta::meta::TableSnapshot;
use opendal::Operator;

use super::AbortOperation;
use crate::io::MetaReaders;
use crate::io::SegmentWriter;
use crate::io::TableMetaLocationGenerator;
Expand Down Expand Up @@ -81,19 +82,8 @@ impl BaseMutator {
});
}

pub async fn into_new_snapshot(
self,
segments: Vec<Location>,
summary: Statistics,
) -> Result<TableSnapshot> {
let snapshot = self.base_snapshot;
let mut new_snapshot = TableSnapshot::from_previous(&snapshot);
new_snapshot.segments = segments;
new_snapshot.summary = summary;
Ok(new_snapshot)
}

pub async fn generate_segments(&self) -> Result<(Vec<Location>, Statistics)> {
pub async fn generate_segments(&self) -> Result<(Vec<Location>, Statistics, AbortOperation)> {
let mut abort_operation = AbortOperation::default();
let segments = self.base_snapshot.segments.clone();
let mut segments_editor =
HashMap::<_, _, RandomState>::from_iter(segments.clone().into_iter().enumerate());
Expand Down Expand Up @@ -143,6 +133,7 @@ impl BaseMutator {
))
})?;
if let Some(block_meta) = replacement.new_block_meta {
abort_operation = abort_operation.add_block(&block_meta);
block_editor.insert(*position, block_meta);
} else {
block_editor.remove(position);
Expand All @@ -160,7 +151,8 @@ impl BaseMutator {
new_segment.summary = new_summary;
// write down new segment
let new_segment_location = seg_writer.write_segment(new_segment).await?;
segments_editor.insert(seg_idx, new_segment_location);
segments_editor.insert(seg_idx, new_segment_location.clone());
abort_operation = abort_operation.add_segment(new_segment_location.0);
}
}

Expand All @@ -175,6 +167,6 @@ impl BaseMutator {

// update the summary of new snapshot
let new_summary = reduce_statistics(&new_segment_summaries)?;
Ok((new_segments, new_summary))
Ok((new_segments, new_summary, abort_operation))
}
}
Loading