Skip to content

Commit

Permalink
Skip unused bin packing in IcebergSplitSource
Browse files Browse the repository at this point in the history
Cherry-pick of trinodb/trino@638d7f7

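The Iceberg planTasks method bin-packs small files together
into combined scan tasks, but IcebergSplitSource never uses that
grouping: it flattens each combined task back into its individual
FileScanTasks. Instead, plan the individual FileScanTasks directly
(planFiles) and split them by the scan's target split size.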

Co-authored-by: Alex Jo <jo.alex2144@gmail.com>
  • Loading branch information
2 people authored and highker committed Jul 1, 2022
1 parent 629f4da commit 03d9533
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import com.google.common.collect.ImmutableList;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
import org.apache.iceberg.util.TableScanUtil;

import javax.inject.Inject;

Expand Down Expand Up @@ -85,7 +86,7 @@ public ConnectorSplitSource getSplits(

// TODO: Use the residual. There is currently no way to propagate the residual to Presto, but we can at least
// propagate it at the split level so that the Parquet pushdown can leverage it.
IcebergSplitSource splitSource = new IcebergSplitSource(session, tableScan.planTasks());
IcebergSplitSource splitSource = new IcebergSplitSource(session, TableScanUtil.splitFiles(tableScan.planFiles(), tableScan.targetSplitSize()));
return splitSource;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,19 @@
import com.facebook.presto.spi.ConnectorSplitSource;
import com.facebook.presto.spi.connector.ConnectorPartitionHandle;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Streams;
import org.apache.iceberg.CombinedScanTask;
import com.google.common.io.Closer;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionField;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.StructLike;
import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.io.CloseableIterator;
import org.apache.iceberg.types.Type;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
Expand All @@ -51,26 +50,27 @@
public class IcebergSplitSource
implements ConnectorSplitSource
{
private final CloseableIterable<CombinedScanTask> combinedScanIterable;
private final Iterator<FileScanTask> fileScanIterator;
private final Closer closer = Closer.create();

private CloseableIterable<FileScanTask> fileScanTaskIterable;
private CloseableIterator<FileScanTask> fileScanTaskIterator;
private final ConnectorSession session;

public IcebergSplitSource(ConnectorSession session, CloseableIterable<CombinedScanTask> combinedScanIterable)
public IcebergSplitSource(ConnectorSession session, CloseableIterable<FileScanTask> fileScanTaskIterable)
{
this.session = requireNonNull(session, "session is null");
this.combinedScanIterable = requireNonNull(combinedScanIterable, "combinedScanIterable is null");
this.fileScanIterator = Streams.stream(combinedScanIterable)
.map(CombinedScanTask::files)
.flatMap(Collection::stream)
.iterator();
this.fileScanTaskIterable = requireNonNull(fileScanTaskIterable, "combinedScanIterable is null");
this.fileScanTaskIterator = fileScanTaskIterable.iterator();
closer.register(fileScanTaskIterable);
closer.register(fileScanTaskIterator);
}

@Override
public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHandle partitionHandle, int maxSize)
{
// TODO: move this to a background thread
List<ConnectorSplit> splits = new ArrayList<>();
Iterator<FileScanTask> iterator = limit(fileScanIterator, maxSize);
Iterator<FileScanTask> iterator = limit(fileScanTaskIterator, maxSize);
while (iterator.hasNext()) {
FileScanTask task = iterator.next();
splits.add(toIcebergSplit(task));
Expand All @@ -81,14 +81,14 @@ public CompletableFuture<ConnectorSplitBatch> getNextBatch(ConnectorPartitionHan
@Override
public boolean isFinished()
{
return !fileScanIterator.hasNext();
return !fileScanTaskIterator.hasNext();
}

@Override
public void close()
{
try {
combinedScanIterable.close();
closer.close();
}
catch (IOException e) {
throw new UncheckedIOException(e);
Expand Down

0 comments on commit 03d9533

Please sign in to comment.