diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveAnalyzeProperties.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveAnalyzeProperties.java
new file mode 100644
index 0000000000000..0c374efd99dfe
--- /dev/null
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveAnalyzeProperties.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.facebook.presto.hive;
+
+import com.facebook.presto.spi.PrestoException;
+import com.facebook.presto.spi.session.PropertyMetadata;
+import com.facebook.presto.spi.type.TypeManager;
+import com.google.common.collect.ImmutableList;
+
+import javax.inject.Inject;
+
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Optional;
+
+import static com.facebook.presto.hive.HivePartitionKey.HIVE_DEFAULT_DYNAMIC_PARTITION;
+import static com.facebook.presto.spi.StandardErrorCode.INVALID_ANALYZE_PROPERTY;
+import static com.facebook.presto.spi.type.TypeSignature.parseTypeSignature;
+import static com.google.common.base.MoreObjects.firstNonNull;
+import static com.google.common.collect.ImmutableList.toImmutableList;
+import static com.google.common.collect.ImmutableSet.toImmutableSet;
+
+public class HiveAnalyzeProperties
+{
+    public static final String PARTITIONS_PROPERTY = "partitions";
+
+    private final List<PropertyMetadata<?>> analyzeProperties;
+
+    @Inject
+    public HiveAnalyzeProperties(TypeManager typeManager)
+    {
+        analyzeProperties = ImmutableList.of(
+                new PropertyMetadata<>(
+                        PARTITIONS_PROPERTY,
+                        "Partitions to be analyzed",
+                        typeManager.getType(parseTypeSignature("array(array(varchar))")),
+                        List.class,
+                        null,
+                        false,
+                        HiveAnalyzeProperties::decodePartitionLists,
+                        value -> value));
+    }
+
+    public List<PropertyMetadata<?>> getAnalyzeProperties()
+    {
+        return analyzeProperties;
+    }
+
+    @SuppressWarnings("unchecked")
+    public static Optional<List<List<String>>> getPartitionList(Map<String, Object> properties)
+    {
+        List<List<String>> partitions = (List<List<String>>) properties.get(PARTITIONS_PROPERTY);
+        return partitions == null ? Optional.empty() : Optional.of(partitions);
+    }
+
+    private static List<List<String>> decodePartitionLists(Object object)
+    {
+        if (object == null) {
+            return null;
+        }
+
+        // replace null partition value with hive default partition
+        return ImmutableList.copyOf(((Collection<?>) object).stream()
+                .peek(HiveAnalyzeProperties::throwIfNull)
+                .map(partition -> ((Collection<?>) partition).stream()
+                        .map(name -> firstNonNull((String) name, HIVE_DEFAULT_DYNAMIC_PARTITION))
+                        .collect(toImmutableList()))
+                .collect(toImmutableSet()));
+    }
+
+    private static void throwIfNull(Object object)
+    {
+        if (object == null) {
+            throw new PrestoException(INVALID_ANALYZE_PROPERTY, "Invalid null value in analyze partitions property");
+        }
+    }
+}
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientModule.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientModule.java
index 63ae156fc0075..b81ee4f9af3f6 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientModule.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveClientModule.java
@@ -71,6 +71,7 @@ public void configure(Binder binder)
         binder.bind(HiveSessionProperties.class).in(Scopes.SINGLETON);
         binder.bind(HiveTableProperties.class).in(Scopes.SINGLETON);
+        binder.bind(HiveAnalyzeProperties.class).in(Scopes.SINGLETON);
 
         binder.bind(NamenodeStats.class).in(Scopes.SINGLETON);
         newExporter(binder).export(NamenodeStats.class).as(generatedNameOf(NamenodeStats.class, connectorId));
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnector.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnector.java
index 296f6034430df..41f37475075d7 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnector.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnector.java
@@ -57,6 +57,8 @@ public class HiveConnector
     private final List<PropertyMetadata<?>> sessionProperties;
     private final List<PropertyMetadata<?>> schemaProperties;
     private final List<PropertyMetadata<?>> tableProperties;
+    private final List<PropertyMetadata<?>> analyzeProperties;
+
     private final ConnectorAccessControl accessControl;
     private final ClassLoader classLoader;
 
@@ -75,6 +77,7 @@ public HiveConnector(
             List<PropertyMetadata<?>> sessionProperties,
             List<PropertyMetadata<?>> schemaProperties,
             List<PropertyMetadata<?>> tableProperties,
+            List<PropertyMetadata<?>> analyzeProperties,
             ConnectorAccessControl accessControl,
             ClassLoader classLoader)
     {
@@ -90,6 +93,7 @@ public HiveConnector(
         this.sessionProperties = ImmutableList.copyOf(requireNonNull(sessionProperties, "sessionProperties is null"));
         this.schemaProperties = ImmutableList.copyOf(requireNonNull(schemaProperties, "schemaProperties is null"));
         this.tableProperties = ImmutableList.copyOf(requireNonNull(tableProperties, "tableProperties is null"));
+        this.analyzeProperties = ImmutableList.copyOf(requireNonNull(analyzeProperties, "analyzeProperties is null"));
         this.accessControl = requireNonNull(accessControl, "accessControl is null");
         this.classLoader = requireNonNull(classLoader, "classLoader is null");
     }
@@ -150,6 +154,12 @@ public List<PropertyMetadata<?>> getSchemaProperties()
         return schemaProperties;
     }
 
+    @Override
+    public List<PropertyMetadata<?>> getAnalyzeProperties()
+    {
+        return analyzeProperties;
+    }
+
     @Override
     public List<PropertyMetadata<?>> getTableProperties()
     {
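(For orientation: the `partitions` analyze property registered above surfaces in SQL as shown below. The statements mirror the ones exercised by the smoke tests near the end of this diff; the table name and partition values here are invented for illustration.)

```sql
-- Analyze a whole table.
ANALYZE hive.default.sales;

-- Analyze only the listed partitions; each inner array supplies one value per
-- partition column, and NULL selects the default (null) partition.
ANALYZE hive.default.sales WITH (partitions = ARRAY[ARRAY['2018-08-01', '7'], ARRAY[NULL, NULL]]);
```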
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnectorFactory.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnectorFactory.java
index 43e75dcbcbe5f..cceee982dafbc 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnectorFactory.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveConnectorFactory.java
@@ -128,6 +128,7 @@ public Connector create(String catalogName, Map<String, String> config, Connecto
             ConnectorNodePartitioningProvider connectorDistributionProvider = injector.getInstance(ConnectorNodePartitioningProvider.class);
             HiveSessionProperties hiveSessionProperties = injector.getInstance(HiveSessionProperties.class);
             HiveTableProperties hiveTableProperties = injector.getInstance(HiveTableProperties.class);
+            HiveAnalyzeProperties hiveAnalyzeProperties = injector.getInstance(HiveAnalyzeProperties.class);
             ConnectorAccessControl accessControl = new PartitionsAwareAccessControl(injector.getInstance(ConnectorAccessControl.class));
             Set<Procedure> procedures = injector.getInstance(Key.get(new TypeLiteral<Set<Procedure>>() {}));
 
@@ -144,6 +145,7 @@ public Connector create(String catalogName, Map<String, String> config, Connecto
                     hiveSessionProperties.getSessionProperties(),
                     HiveSchemaProperties.SCHEMA_PROPERTIES,
                     hiveTableProperties.getTableProperties(),
+                    hiveAnalyzeProperties.getAnalyzeProperties(),
                     accessControl,
                     classLoader);
         }
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java
index 4f5d3ee40220b..e5a7b756a4a55 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveErrorCode.java
@@ -63,6 +63,7 @@ public enum HiveErrorCode
     // HIVE_TOO_MANY_BUCKET_SORT_FILES(36) is deprecated
     HIVE_CORRUPTED_COLUMN_STATISTICS(37, EXTERNAL),
     HIVE_EXCEEDED_SPLIT_BUFFERING_LIMIT(38, USER_ERROR),
+    HIVE_UNKNOWN_COLUMN_STATISTIC_TYPE(39, INTERNAL_ERROR),
     /**/;
 
     private final ErrorCode errorCode;
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java
index cd0b7b2357a71..c9fb6ef695a69 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveMetadata.java
@@ -64,6 +64,7 @@
 import com.facebook.presto.spi.statistics.ColumnStatisticMetadata;
 import com.facebook.presto.spi.statistics.ColumnStatisticType;
 import com.facebook.presto.spi.statistics.ComputedStatistics;
+import com.facebook.presto.spi.statistics.TableStatisticType;
 import com.facebook.presto.spi.statistics.TableStatistics;
 import com.facebook.presto.spi.statistics.TableStatisticsMetadata;
 import com.facebook.presto.spi.type.Type;
@@ -71,7 +72,9 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Joiner;
 import com.google.common.base.Splitter;
+import com.google.common.base.Suppliers;
 import com.google.common.base.Verify;
+import com.google.common.base.VerifyException;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableMap.Builder;
@@ -105,9 +108,11 @@
 import java.util.Set;
 import java.util.function.Function;
 import java.util.function.Predicate;
+import java.util.function.Supplier;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;
 
+import static com.facebook.presto.hive.HiveAnalyzeProperties.getPartitionList;
 import static com.facebook.presto.hive.HiveBasicStatistics.createEmptyStatistics;
 import static com.facebook.presto.hive.HiveBasicStatistics.createZeroStatistics;
 import static com.facebook.presto.hive.HiveBucketing.getHiveBucketHandle;
@@ -176,13 +181,17 @@
 import static com.facebook.presto.hive.util.ConfigurationUtils.toJobConf;
 import static com.facebook.presto.hive.util.Statistics.ReduceOperator.ADD;
 import static com.facebook.presto.hive.util.Statistics.createComputedStatisticsToPartitionMap;
+import static com.facebook.presto.hive.util.Statistics.createEmptyPartitionStatistics;
 import static com.facebook.presto.hive.util.Statistics.fromComputedStatistics;
 import static com.facebook.presto.hive.util.Statistics.reduce;
+import static com.facebook.presto.spi.StandardErrorCode.INVALID_ANALYZE_PROPERTY;
 import static com.facebook.presto.spi.StandardErrorCode.INVALID_SCHEMA_PROPERTY;
 import static com.facebook.presto.spi.StandardErrorCode.INVALID_TABLE_PROPERTY;
 import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED;
 import static com.facebook.presto.spi.StandardErrorCode.SCHEMA_NOT_EMPTY;
 import static com.facebook.presto.spi.predicate.TupleDomain.withColumnDomains;
+import static com.facebook.presto.spi.statistics.TableStatisticType.ROW_COUNT;
+import static com.facebook.presto.spi.type.BigintType.BIGINT;
 import static com.google.common.base.MoreObjects.firstNonNull;
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Verify.verify;
@@ -295,6 +304,25 @@ public HiveTableHandle getTableHandle(ConnectorSession session, SchemaTableName
         return new HiveTableHandle(tableName.getSchemaName(), tableName.getTableName());
     }
 
+    @Override
+    public ConnectorTableHandle getTableHandleForStatisticsCollection(ConnectorSession session, SchemaTableName tableName, Map<String, Object> analyzeProperties)
+    {
+        HiveTableHandle handle = getTableHandle(session, tableName);
+        if (handle == null) {
+            return null;
+        }
+        Optional<List<List<String>>> partitionValuesList = getPartitionList(analyzeProperties);
+        ConnectorTableMetadata tableMetadata = getTableMetadata(handle.getSchemaTableName());
+        handle = handle.withAnalyzePartitionValues(partitionValuesList);
+
+        List<String> partitionedBy = getPartitionedBy(tableMetadata.getProperties());
+
+        if (partitionValuesList.isPresent() && partitionedBy.isEmpty()) {
+            throw new PrestoException(INVALID_ANALYZE_PROPERTY, "Only partitioned table can be analyzed with a partition list");
+        }
+        return handle;
+    }
+
     @Override
     public Optional<SystemTable> getSystemTable(ConnectorSession session, SchemaTableName tableName)
     {
@@ -905,6 +933,77 @@ public void dropTable(ConnectorSession session, ConnectorTableHandle tableHandle
         metastore.dropTable(session, handle.getSchemaName(), handle.getTableName());
     }
 
+    @Override
+    public ConnectorTableHandle beginStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle)
+    {
+        verifyJvmTimeZone();
+        HiveTableHandle handle = (HiveTableHandle) tableHandle;
+        SchemaTableName tableName = handle.getSchemaTableName();
+
+        metastore.getTable(tableName.getSchemaName(), tableName.getTableName())
+                .orElseThrow(() -> new TableNotFoundException(tableName));
+        return handle;
+    }
+
+    @Override
+    public void finishStatisticsCollection(ConnectorSession session, ConnectorTableHandle tableHandle, Collection<ComputedStatistics> computedStatistics)
+    {
+        HiveTableHandle handle = (HiveTableHandle) tableHandle;
+        SchemaTableName tableName = handle.getSchemaTableName();
+        Table table = metastore.getTable(tableName.getSchemaName(), tableName.getTableName())
+                .orElseThrow(() -> new TableNotFoundException(handle.getSchemaTableName()));
+
+        List<Column> partitionColumns = table.getPartitionColumns();
+        List<String> partitionColumnNames = partitionColumns.stream()
+                .map(Column::getName)
+                .collect(toImmutableList());
+        List<HiveColumnHandle> hiveColumnHandles = hiveColumnHandles(table);
+        Map<String, Type> columnTypes = hiveColumnHandles.stream()
+                .filter(columnHandle -> !columnHandle.isHidden())
+                .collect(toImmutableMap(HiveColumnHandle::getName, column -> column.getHiveType().getType(typeManager)));
+
+        Map<List<String>, ComputedStatistics> computedStatisticsMap = createComputedStatisticsToPartitionMap(computedStatistics, partitionColumnNames, columnTypes);
+
+        if (partitionColumns.isEmpty()) {
+            // commit analyze to unpartitioned table
+            metastore.setTableStatistics(table, createPartitionStatistics(session, columnTypes, computedStatisticsMap.get(ImmutableList.of())));
+        }
+        else {
+            List<List<String>> partitionValuesList;
+            if (handle.getAnalyzePartitionValues().isPresent()) {
+                partitionValuesList = handle.getAnalyzePartitionValues().get();
+            }
+            else {
+                partitionValuesList = metastore.getPartitionNames(handle.getSchemaName(), handle.getTableName())
+                        .orElseThrow(() -> new TableNotFoundException(((HiveTableHandle) tableHandle).getSchemaTableName()))
+                        .stream()
+                        .map(HiveUtil::toPartitionValues)
+                        .collect(toImmutableList());
+            }
+
+            ImmutableMap.Builder<List<String>, PartitionStatistics> partitionStatistics = ImmutableMap.builder();
+            Map<String, Set<ColumnStatisticType>> columnStatisticTypes = hiveColumnHandles.stream()
+                    .filter(columnHandle -> !partitionColumnNames.contains(columnHandle.getName()))
+                    .filter(column -> !column.isHidden())
+                    .collect(toImmutableMap(HiveColumnHandle::getName, column -> ImmutableSet.copyOf(metastore.getSupportedColumnStatistics(typeManager.getType(column.getTypeSignature())))));
+            Supplier<PartitionStatistics> emptyPartitionStatistics = Suppliers.memoize(() -> createEmptyPartitionStatistics(columnTypes, columnStatisticTypes));
+
+            int usedComputedStatistics = 0;
+            for (List<String> partitionValues : partitionValuesList) {
+                ComputedStatistics collectedStatistics = computedStatisticsMap.get(partitionValues);
+                if (collectedStatistics == null) {
+                    partitionStatistics.put(partitionValues, emptyPartitionStatistics.get());
+                }
+                else {
+                    usedComputedStatistics++;
+                    partitionStatistics.put(partitionValues, createPartitionStatistics(session, columnTypes, collectedStatistics));
+                }
+            }
+            verify(usedComputedStatistics == computedStatistics.size(), "All computed statistics must be used");
+            metastore.setPartitionStatistics(table, partitionStatistics.build());
+        }
+    }
+
     @Override
     public HiveOutputTableHandle beginCreateTable(ConnectorSession session, ConnectorTableMetadata tableMetadata, Optional<ConnectorNewTableLayout> layout)
     {
@@ -1011,7 +1110,7 @@ public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession sess
                     .map(PartitionUpdate::getStatistics)
                     .reduce((first, second) -> reduce(first, second, ADD))
                     .orElse(createZeroStatistics());
-            tableStatistics = createPartitionStatistics(session, basicStatistics, ImmutableList.of(), columnTypes, partitionComputedStatistics);
+            tableStatistics = createPartitionStatistics(session, basicStatistics, columnTypes, getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
         }
         else {
             tableStatistics = new PartitionStatistics(createEmptyStatistics(), ImmutableMap.of());
@@ -1025,7 +1124,11 @@ public Optional<ConnectorOutputMetadata> finishCreateTable(ConnectorSession sess
         }
         for (PartitionUpdate update : partitionUpdates) {
             Partition partition = buildPartitionObject(session, table, update);
-            PartitionStatistics partitionStatistics = createPartitionStatistics(session, update.getStatistics(), partition.getValues(), columnTypes, partitionComputedStatistics);
+            PartitionStatistics partitionStatistics = createPartitionStatistics(
+                    session,
+                    update.getStatistics(),
+                    columnTypes,
+                    getColumnStatistics(partitionComputedStatistics, partition.getValues()));
             metastore.addPartition(
                     session,
                     handle.getSchemaName(),
@@ -1222,17 +1325,27 @@ public Optional<ConnectorOutputMetadata> finishInsert(ConnectorSession session,
         for (PartitionUpdate partitionUpdate : partitionUpdates) {
             if (partitionUpdate.getName().isEmpty()) {
                 // insert into unpartitioned table
+                PartitionStatistics partitionStatistics = createPartitionStatistics(
+                        session,
+                        partitionUpdate.getStatistics(),
+                        columnTypes,
+                        getColumnStatistics(partitionComputedStatistics, ImmutableList.of()));
                 metastore.finishInsertIntoExistingTable(
                         session,
                         handle.getSchemaName(),
                         handle.getTableName(),
                         partitionUpdate.getWritePath(),
                         partitionUpdate.getFileNames(),
-                        createPartitionStatistics(session, partitionUpdate.getStatistics(), ImmutableList.of(), columnTypes, partitionComputedStatistics));
+                        partitionStatistics);
             }
             else if (partitionUpdate.getUpdateMode() == APPEND) {
                 // insert into existing partition
                 List<String> partitionValues = toPartitionValues(partitionUpdate.getName());
+                PartitionStatistics partitionStatistics = createPartitionStatistics(
+                        session,
+                        partitionUpdate.getStatistics(),
+                        columnTypes,
+                        getColumnStatistics(partitionComputedStatistics, partitionValues));
                 metastore.finishInsertIntoExistingPartition(
                         session,
                         handle.getSchemaName(),
@@ -1240,7 +1353,7 @@ else if (partitionUpdate.getUpdateMode() == APPEND) {
                         partitionValues,
                         partitionUpdate.getWritePath(),
                         partitionUpdate.getFileNames(),
-                        createPartitionStatistics(session, partitionUpdate.getStatistics(), partitionValues, columnTypes, partitionComputedStatistics));
+                        partitionStatistics);
             }
             else if (partitionUpdate.getUpdateMode() == NEW || partitionUpdate.getUpdateMode() == OVERWRITE) {
                 // insert into new partition or overwrite existing partition
@@ -1251,13 +1364,12 @@ else if (partitionUpdate.getUpdateMode() == NEW || partitionUpdate.getUpdateMode
                 if (partitionUpdate.getUpdateMode() == OVERWRITE) {
                     metastore.dropPartition(session, handle.getSchemaName(), handle.getTableName(), partition.getValues());
                 }
-                metastore.addPartition(
+                PartitionStatistics partitionStatistics = createPartitionStatistics(
                         session,
-                        handle.getSchemaName(),
-                        handle.getTableName(),
-                        partition,
-                        partitionUpdate.getWritePath(),
-                        createPartitionStatistics(session, partitionUpdate.getStatistics(), partition.getValues(), columnTypes, partitionComputedStatistics));
+                        partitionUpdate.getStatistics(),
+                        columnTypes,
+                        getColumnStatistics(partitionComputedStatistics, partition.getValues()));
+                metastore.addPartition(session, handle.getSchemaName(), handle.getTableName(), partition, partitionUpdate.getWritePath(), partitionStatistics);
             }
             else {
                 throw new IllegalArgumentException(format("Unsupported update mode: %s", partitionUpdate.getUpdateMode()));
@@ -1291,16 +1403,27 @@ private Partition buildPartitionObject(ConnectorSession session, Table table, Pa
                 .build();
     }
 
+    private PartitionStatistics createPartitionStatistics(
+            ConnectorSession session,
+            Map<String, Type> columnTypes,
+            ComputedStatistics computedStatistics)
+    {
+        Map<ColumnStatisticMetadata, Block> computedColumnStatistics = computedStatistics.getColumnStatistics();
+
+        Block rowCountBlock = Optional.ofNullable(computedStatistics.getTableStatistics().get(ROW_COUNT))
+                .orElseThrow(() -> new VerifyException("rowCount not present"));
+        verify(!rowCountBlock.isNull(0), "rowCount must never be null");
+        long rowCount = BIGINT.getLong(rowCountBlock, 0);
+        HiveBasicStatistics rowCountOnlyBasicStatistics = new HiveBasicStatistics(OptionalLong.empty(), OptionalLong.of(rowCount), OptionalLong.empty(), OptionalLong.empty());
+        return createPartitionStatistics(session, rowCountOnlyBasicStatistics, columnTypes, computedColumnStatistics);
+    }
+
     private PartitionStatistics createPartitionStatistics(
             ConnectorSession session,
             HiveBasicStatistics basicStatistics,
-            List<String> partitionValues,
             Map<String, Type> columnTypes,
-            Map<List<String>, ComputedStatistics> partitionComputedStatistics)
+            Map<ColumnStatisticMetadata, Block> computedColumnStatistics)
     {
-        Map<ColumnStatisticMetadata, Block> computedColumnStatistics = Optional.ofNullable(partitionComputedStatistics.get(partitionValues))
-                .map(ComputedStatistics::getColumnStatistics)
-                .orElse(ImmutableMap.of());
         long rowCount = basicStatistics.getRowCount().orElseThrow(() -> new IllegalArgumentException("rowCount not present"));
         Map<String, HiveColumnStatistics> columnStatistics = fromComputedStatistics(
                 session,
@@ -1311,6 +1434,13 @@ private PartitionStatistics createPartitionStatistics(
         return new PartitionStatistics(basicStatistics, columnStatistics);
     }
 
+    private Map<ColumnStatisticMetadata, Block> getColumnStatistics(Map<List<String>, ComputedStatistics> partitionComputedStatistics, List<String> partitionValues)
+    {
+        return Optional.ofNullable(partitionComputedStatistics.get(partitionValues))
+                .map(ComputedStatistics::getColumnStatistics)
+                .orElse(ImmutableMap.of());
+    }
+
     @Override
     public void createView(ConnectorSession session, SchemaTableName viewName, String viewData, boolean replace)
     {
@@ -1475,7 +1605,14 @@ public boolean supportsMetadataDelete(ConnectorSession session, ConnectorTableHa
     public List<ConnectorTableLayoutResult> getTableLayouts(ConnectorSession session, ConnectorTableHandle tableHandle, Constraint<ColumnHandle> constraint, Optional<Set<ColumnHandle>> desiredColumns)
     {
         HiveTableHandle handle = (HiveTableHandle) tableHandle;
-        HivePartitionResult hivePartitionResult = partitionManager.getPartitions(metastore, tableHandle, constraint);
+        HivePartitionResult hivePartitionResult;
+        if (handle.getAnalyzePartitionValues().isPresent()) {
+            verify(constraint.getSummary().isAll(), "There shouldn't be any constraint for ANALYZE operation");
+            hivePartitionResult = partitionManager.getPartitions(metastore, tableHandle, handle.getAnalyzePartitionValues().get());
+        }
+        else {
+            hivePartitionResult = partitionManager.getPartitions(metastore, tableHandle, constraint);
+        }
 
         return ImmutableList.of(new ConnectorTableLayoutResult(
                 getTableLayout(
@@ -1723,16 +1860,23 @@ public Optional<ConnectorNewTableLayout> getNewTableLayout(ConnectorSession sess
     }
 
     @Override
-    public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession session, ConnectorTableMetadata tableMetadata)
+    public TableStatisticsMetadata getStatisticsCollectionMetadataForWrite(ConnectorSession session, ConnectorTableMetadata tableMetadata)
     {
         if (!isCollectColumnStatisticsOnWrite(session)) {
            return TableStatisticsMetadata.empty();
         }
         List<String> partitionedBy = firstNonNull(getPartitionedBy(tableMetadata.getProperties()), ImmutableList.of());
-        return getStatisticsCollectionMetadata(tableMetadata.getColumns(), partitionedBy);
+        return getStatisticsCollectionMetadata(tableMetadata.getColumns(), partitionedBy, false);
     }
 
-    private TableStatisticsMetadata getStatisticsCollectionMetadata(List<ColumnMetadata> columns, List<String> partitionedBy)
+    @Override
+    public TableStatisticsMetadata getStatisticsCollectionMetadata(ConnectorSession session, ConnectorTableMetadata tableMetadata)
+    {
+        List<String> partitionedBy = firstNonNull(getPartitionedBy(tableMetadata.getProperties()), ImmutableList.of());
+        return getStatisticsCollectionMetadata(tableMetadata.getColumns(), partitionedBy, true);
+    }
+
+    private TableStatisticsMetadata getStatisticsCollectionMetadata(List<ColumnMetadata> columns, List<String> partitionedBy, boolean includeRowCount)
     {
         Set<ColumnStatisticMetadata> columnStatistics = columns.stream()
                 .filter(column -> !partitionedBy.contains(column.getName()))
@@ -1740,7 +1884,9 @@ private TableStatisticsMetadata getStatisticsCollectionMetadata(List<ColumnMetad
                 .map(this::getColumnStatisticMetadata)
                 .flatMap(List::stream)
                 .collect(toImmutableSet());
-        return new TableStatisticsMetadata(columnStatistics, ImmutableSet.of(), partitionedBy);
+
+        Set<TableStatisticType> tableStatistics = includeRowCount ? ImmutableSet.of(ROW_COUNT) : ImmutableSet.of();
+        return new TableStatisticsMetadata(columnStatistics, tableStatistics, partitionedBy);
     }
 
     private List<ColumnStatisticMetadata> getColumnStatisticMetadata(ColumnMetadata columnMetadata)
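(Usage sketch: `finishStatisticsCollection` above writes the collected statistics back through the metastore, and the result is observable with `SHOW STATS`. Table and partition names below are illustrative; the integration tests later in this diff run the same statements.)

```sql
-- Analyze two partitions of a table partitioned by (p_varchar, p_bigint).
ANALYZE hive.tpch.sales WITH (partitions = ARRAY[ARRAY['p1', '7'], ARRAY['p2', '7']]);

-- Inspect the statistics stored for a single partition.
SHOW STATS FOR (SELECT * FROM hive.tpch.sales WHERE p_varchar = 'p1' AND p_bigint = 7);
```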
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java b/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java
index de0c970bd861d..02edda8626eff 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HivePartitionManager.java
@@ -42,6 +42,7 @@
 import com.facebook.presto.spi.type.TypeManager;
 import com.facebook.presto.spi.type.VarcharType;
 import com.google.common.base.Predicates;
+import com.google.common.base.VerifyException;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Maps;
@@ -64,11 +65,16 @@
 import static com.facebook.presto.hive.HiveUtil.getPartitionKeyColumnHandles;
 import static com.facebook.presto.hive.HiveUtil.parsePartitionValue;
 import static com.facebook.presto.hive.metastore.MetastoreUtil.getProtectMode;
+import static com.facebook.presto.hive.metastore.MetastoreUtil.makePartName;
 import static com.facebook.presto.hive.metastore.MetastoreUtil.verifyOnline;
+import static com.facebook.presto.spi.Constraint.alwaysTrue;
 import static com.facebook.presto.spi.StandardErrorCode.NOT_SUPPORTED;
+import static com.facebook.presto.spi.predicate.TupleDomain.all;
+import static com.facebook.presto.spi.predicate.TupleDomain.none;
 import static com.facebook.presto.spi.type.Chars.padSpaces;
 import static com.google.common.base.Preconditions.checkArgument;
 import static com.google.common.base.Predicates.not;
+import static com.google.common.collect.ImmutableList.toImmutableList;
 import static java.lang.String.format;
 import static java.util.Objects.requireNonNull;
 import static java.util.stream.Collectors.toList;
@@ -119,7 +125,7 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
         List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);
 
         if (effectivePredicate.isNone()) {
-            return new HivePartitionResult(partitionColumns, ImmutableList.of(), TupleDomain.none(), TupleDomain.none(), TupleDomain.none(), hiveBucketHandle, Optional.empty());
+            return new HivePartitionResult(partitionColumns, ImmutableList.of(), none(), none(), none(), hiveBucketHandle, Optional.empty());
         }
 
         Optional<HiveBucketFilter> bucketFilter = getHiveBucketFilter(table, effectivePredicate);
@@ -131,7 +137,7 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
                     ImmutableList.of(new HivePartition(tableName)),
                     compactEffectivePredicate,
                     effectivePredicate,
-                    TupleDomain.none(),
+                    none(),
                     hiveBucketHandle,
                     bucketFilter);
         }
@@ -155,6 +161,27 @@ public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastor
         return new HivePartitionResult(partitionColumns, partitionsIterable, compactEffectivePredicate, remainingTupleDomain, enforcedTupleDomain, hiveBucketHandle, bucketFilter);
     }
 
+    public HivePartitionResult getPartitions(SemiTransactionalHiveMetastore metastore, ConnectorTableHandle tableHandle, List<List<String>> partitionValuesList)
+    {
+        HiveTableHandle hiveTableHandle = (HiveTableHandle) tableHandle;
+        SchemaTableName tableName = hiveTableHandle.getSchemaTableName();
+
+        Table table = getTable(metastore, tableName);
+
+        List<HiveColumnHandle> partitionColumns = getPartitionKeyColumnHandles(table);
+        List<Type> partitionColumnTypes = partitionColumns.stream()
+                .map(column -> typeManager.getType(column.getTypeSignature()))
+                .collect(toImmutableList());
+
+        List<HivePartition> partitionList = partitionValuesList.stream()
+                .map(partitionValues -> makePartName(table.getPartitionColumns(), partitionValues))
+                .map(partitionName -> parseValuesAndFilterPartition(tableName, partitionName, partitionColumns, partitionColumnTypes, alwaysTrue()))
+                .map(partition -> partition.orElseThrow(() -> new VerifyException("partition must exist")))
+                .collect(toImmutableList());
+
+        return new HivePartitionResult(partitionColumns, partitionList, all(), all(), none(), getHiveBucketHandle(table), Optional.empty());
+    }
+
     private static TupleDomain<ColumnHandle> toCompactTupleDomain(TupleDomain<ColumnHandle> effectivePredicate, int threshold)
     {
         ImmutableMap.Builder<ColumnHandle, Domain> builder = ImmutableMap.builder();
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/HiveTableHandle.java b/presto-hive/src/main/java/com/facebook/presto/hive/HiveTableHandle.java
index 4fe48dc857e5e..2124d869d8182 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/HiveTableHandle.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/HiveTableHandle.java
@@ -18,8 +18,11 @@
 import com.fasterxml.jackson.annotation.JsonCreator;
 import com.fasterxml.jackson.annotation.JsonProperty;
 
+import java.util.List;
 import java.util.Objects;
+import java.util.Optional;
 
+import static com.google.common.base.MoreObjects.toStringHelper;
 import static java.util.Objects.requireNonNull;
 
 public class HiveTableHandle
@@ -28,13 +31,27 @@ public class HiveTableHandle
     private final String schemaName;
     private final String tableName;
 
+    private final Optional<List<List<String>>> analyzePartitionValues;
+
     @JsonCreator
     public HiveTableHandle(
             @JsonProperty("schemaName") String schemaName,
-            @JsonProperty("tableName") String tableName)
+            @JsonProperty("tableName") String tableName,
+            @JsonProperty("analyzePartitionValues") Optional<List<List<String>>> analyzePartitionValues)
     {
         this.schemaName = requireNonNull(schemaName, "schemaName is null");
         this.tableName = requireNonNull(tableName, "tableName is null");
+        this.analyzePartitionValues = requireNonNull(analyzePartitionValues, "analyzePartitionValues is null");
+    }
+
+    public HiveTableHandle(String schemaName, String tableName)
+    {
+        this(schemaName, tableName, Optional.empty());
+    }
+
+    public HiveTableHandle withAnalyzePartitionValues(Optional<List<List<String>>> analyzePartitionValues)
+    {
+        return new HiveTableHandle(schemaName, tableName, analyzePartitionValues);
     }
 
     @JsonProperty
@@ -49,34 +66,45 @@ public String getTableName()
         return tableName;
     }
 
-    public SchemaTableName getSchemaTableName()
+    @JsonProperty
+    public Optional<List<List<String>>> getAnalyzePartitionValues()
     {
-        return new SchemaTableName(schemaName, tableName);
+        return analyzePartitionValues;
     }
 
-    @Override
-    public int hashCode()
+    public SchemaTableName getSchemaTableName()
     {
-        return Objects.hash(schemaName, tableName);
+        return new SchemaTableName(schemaName, tableName);
     }
 
     @Override
-    public boolean equals(Object obj)
+    public boolean equals(Object o)
     {
-        if (this == obj) {
+        if (this == o) {
             return true;
         }
-        if (obj == null || getClass() != obj.getClass()) {
+        if (o == null || getClass() != o.getClass()) {
             return false;
         }
-        HiveTableHandle other = (HiveTableHandle) obj;
-        return Objects.equals(this.schemaName, other.schemaName) &&
-                Objects.equals(this.tableName, other.tableName);
+        HiveTableHandle that = (HiveTableHandle) o;
+        return Objects.equals(schemaName, that.schemaName) &&
+                Objects.equals(tableName, that.tableName) &&
+                Objects.equals(analyzePartitionValues, that.analyzePartitionValues);
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return Objects.hash(schemaName, tableName, analyzePartitionValues);
     }
 
     @Override
     public String toString()
     {
-        return schemaName + ":" + tableName;
+        return toStringHelper(this)
+                .add("schemaName", schemaName)
+                .add("tableName", tableName)
+                .add("analyzePartitionValues", analyzePartitionValues)
+                .toString();
     }
 }
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java
index 1bbe0baeae224..57ad58cf9d10a 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/metastore/SemiTransactionalHiveMetastore.java
@@ -15,6 +15,7 @@
 
 import com.facebook.presto.hive.HdfsEnvironment;
 import com.facebook.presto.hive.HdfsEnvironment.HdfsContext;
+import com.facebook.presto.hive.HiveBasicStatistics;
 import com.facebook.presto.hive.HiveType;
 import com.facebook.presto.hive.LocationHandle.WriteMode;
 import com.facebook.presto.hive.PartitionNotFoundException;
@@ -51,6 +52,7 @@
 import java.util.List;
 import java.util.Map;
 import java.util.Optional;
+import java.util.OptionalLong;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.Executor;
@@ -295,6 +297,45 @@ public synchronized void renameDatabase(String source, String target)
         setExclusive((delegate, hdfsEnvironment) -> delegate.renameDatabase(source, target));
     }
 
+    // TODO: Allow updating statistics for 2 tables in the same transaction
+    public synchronized void setTableStatistics(Table table, PartitionStatistics tableStatistics)
+    {
+        setExclusive((delegate, hdfsEnvironment) ->
+                delegate.updateTableStatistics(table.getDatabaseName(), table.getTableName(), statistics -> updatePartitionStatistics(statistics, tableStatistics)));
+    }
+
+    // TODO: Allow updating statistics for 2 tables in the same transaction
+    public synchronized void setPartitionStatistics(Table table, Map<List<String>, PartitionStatistics> partitionStatisticsMap)
+    {
+        setExclusive((delegate, hdfsEnvironment) ->
+                partitionStatisticsMap.forEach((partitionValues, newPartitionStats) ->
+                        delegate.updatePartitionStatistics(
+                                table.getDatabaseName(),
+                                table.getTableName(),
+                                getPartitionName(table, partitionValues),
+                                oldPartitionStats -> updatePartitionStatistics(oldPartitionStats, newPartitionStats))));
+    }
+
+    // For HiveBasicStatistics, we only overwrite the original statistics if the new ones are not empty.
+    // For HiveColumnStatistics, we always overwrite all statistics.
+    // TODO: Collect file count, on-disk size and in-memory size during ANALYZE
+    private PartitionStatistics updatePartitionStatistics(PartitionStatistics oldPartitionStats, PartitionStatistics newPartitionStats)
+    {
+        HiveBasicStatistics oldBasicStatistics = oldPartitionStats.getBasicStatistics();
+        HiveBasicStatistics newBasicStatistics = newPartitionStats.getBasicStatistics();
+        HiveBasicStatistics updatedBasicStatistics = new HiveBasicStatistics(
+                firstPresent(newBasicStatistics.getFileCount(), oldBasicStatistics.getFileCount()),
+                firstPresent(newBasicStatistics.getRowCount(), oldBasicStatistics.getRowCount()),
+                firstPresent(newBasicStatistics.getInMemoryDataSizeInBytes(), oldBasicStatistics.getInMemoryDataSizeInBytes()),
+                firstPresent(newBasicStatistics.getOnDiskDataSizeInBytes(), oldBasicStatistics.getOnDiskDataSizeInBytes()));
+        return new PartitionStatistics(updatedBasicStatistics, newPartitionStats.getColumnStatistics());
+    }
+
+    private static OptionalLong firstPresent(OptionalLong first, OptionalLong second)
+    {
+        return first.isPresent() ? first : second;
+    }
+
     /**
      * {@code currentLocation} needs to be supplied if a writePath exists for the table.
      */
@@ -727,6 +768,11 @@ private String getPartitionName(String databaseName, String tableName, List<Stri
     {
         Table table = getTable(databaseName, tableName)
                 .orElseThrow(() -> new TableNotFoundException(new SchemaTableName(databaseName, tableName)));
+        return getPartitionName(table, partitionValues);
+    }
+
+    private String getPartitionName(Table table, List<String> partitionValues)
+    {
         List<String> columnNames = table.getPartitionColumns().stream()
                 .map(Column::getName)
                 .collect(toImmutableList());
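(The merge rule above is user-visible: ANALYZE supplies a row count and column statistics, so those are overwritten, while the file count and size counters recorded at write time survive because ANALYZE does not collect them yet, per the TODO. The product tests at the end of this diff assert exactly this; the table name below is invented.)

```sql
-- The CTAS records write-time basic statistics (numFiles, numRows, totalSize, ...).
CREATE TABLE hive.default.nation_copy AS SELECT * FROM tpch.tiny.nation;

-- Re-collecting statistics overwrites the row count and all column statistics,
-- but leaves the write-time file count and size counters in place.
ANALYZE hive.default.nation_copy;
```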
diff --git a/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java b/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java
index 1ae5c3b555f84..661e606a89649 100644
--- a/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java
+++ b/presto-hive/src/main/java/com/facebook/presto/hive/util/Statistics.java
@@ -23,6 +23,7 @@
 import com.facebook.presto.hive.metastore.IntegerStatistics;
 import com.facebook.presto.spi.ConnectorSession;
 import com.facebook.presto.spi.Page;
+import com.facebook.presto.spi.PrestoException;
 import com.facebook.presto.spi.block.Block;
 import com.facebook.presto.spi.statistics.ColumnStatisticMetadata;
 import com.facebook.presto.spi.statistics.ColumnStatisticType;
@@ -46,6 +47,8 @@
 import java.util.OptionalLong;
 import java.util.Set;
 
+import static com.facebook.presto.hive.HiveBasicStatistics.createZeroStatistics;
+import static com.facebook.presto.hive.HiveErrorCode.HIVE_UNKNOWN_COLUMN_STATISTIC_TYPE;
 import static com.facebook.presto.hive.HiveWriteUtils.createPartitionValues;
 import static com.facebook.presto.hive.util.Statistics.ReduceOperator.ADD;
 import static com.facebook.presto.hive.util.Statistics.ReduceOperator.MAX;
@@ -70,6 +73,7 @@
 import static com.google.common.collect.ImmutableList.toImmutableList;
 import static com.google.common.collect.ImmutableMap.toImmutableMap;
 import static com.google.common.collect.Sets.intersection;
+import static java.util.Objects.requireNonNull;
 import static java.util.concurrent.TimeUnit.MILLISECONDS;
 
 public final class Statistics
@@ -244,6 +248,67 @@ private static <T extends Comparable<? super T>> T min(T first, T second)
         return first.compareTo(second) <= 0 ? first : second;
     }
 
+    public static PartitionStatistics createEmptyPartitionStatistics(Map<String, Type> columnTypes, Map<String, Set<ColumnStatisticType>> columnStatisticsMetadataTypes)
+    {
+        Map<String, HiveColumnStatistics> columnStatistics = columnStatisticsMetadataTypes.entrySet().stream()
+                .collect(toImmutableMap(Entry::getKey, entry -> createColumnStatisticsForEmptyPartition(columnTypes.get(entry.getKey()), entry.getValue())));
+        return new PartitionStatistics(createZeroStatistics(), columnStatistics);
+    }
+
+    private static HiveColumnStatistics createColumnStatisticsForEmptyPartition(Type columnType, Set<ColumnStatisticType> columnStatisticTypes)
+    {
+        requireNonNull(columnType, "columnType is null");
+        HiveColumnStatistics.Builder result = HiveColumnStatistics.builder();
+        for (ColumnStatisticType columnStatisticType : columnStatisticTypes) {
+            switch (columnStatisticType) {
+                case MAX_VALUE_SIZE_IN_BYTES:
+                    result.setMaxValueSizeInBytes(0);
+                    break;
+                case TOTAL_SIZE_IN_BYTES:
+                    result.setTotalSizeInBytes(0);
+                    break;
+                case NUMBER_OF_DISTINCT_VALUES:
+                    result.setDistinctValuesCount(0);
+                    break;
+                case NUMBER_OF_NON_NULL_VALUES:
+                    result.setNullsCount(0);
+                    break;
+                case NUMBER_OF_TRUE_VALUES:
+                    result.setBooleanStatistics(new BooleanStatistics(OptionalLong.of(0L), OptionalLong.of(0L)));
+                    break;
+                case MIN_VALUE:
+                case MAX_VALUE:
+                    setMinMaxForEmptyPartition(columnType, result);
+                    break;
+                default:
+                    throw new PrestoException(HIVE_UNKNOWN_COLUMN_STATISTIC_TYPE, "Unknown column statistics type: " + columnStatisticType.name());
+            }
+        }
+        return result.build();
+    }
+
+    private static void setMinMaxForEmptyPartition(Type type, HiveColumnStatistics.Builder result)
+    {
+        if (type.equals(BIGINT) || type.equals(INTEGER) || type.equals(SMALLINT) || type.equals(TINYINT)) {
+            result.setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty()));
+        }
+        else if (type.equals(DOUBLE) || type.equals(REAL)) {
+            result.setDoubleStatistics(new DoubleStatistics(OptionalDouble.empty(), OptionalDouble.empty()));
+        }
+        else if (type.equals(DATE)) {
+            result.setDateStatistics(new DateStatistics(Optional.empty(), Optional.empty()));
+        }
+        else if (type.equals(TIMESTAMP)) {
+            result.setIntegerStatistics(new IntegerStatistics(OptionalLong.empty(), OptionalLong.empty()));
+        }
+        else if (type instanceof DecimalType) {
+            result.setDecimalStatistics(new DecimalStatistics(Optional.empty(), Optional.empty()));
+        }
+        else {
+            throw new IllegalArgumentException("Unexpected type: " + type);
+        }
+    }
+
     public static Map<List<String>, ComputedStatistics> createComputedStatisticsToPartitionMap(
             Collection<ComputedStatistics> computedStatistics,
             List<String> partitionColumns,
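(`createEmptyPartitionStatistics` above is what produces the zeroed statistics that the smoke test below asserts for analyzed-but-empty partitions, e.g.:)

```sql
-- An analyzed but empty partition reports zero counts and no min/max values;
-- the table and partition come from testAnalyzePartitionedTable below.
SHOW STATS FOR (SELECT * FROM test_analyze_partitioned_table WHERE p_varchar = 'e1' AND p_bigint = 9);
```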
tableName, format(".*Table 'hive.tpch.%s' does not exist.*", tableName)); + + createPartitionedTableForAnalyzeTest(tableName); + + // Test invalid property + assertQueryFails(format("ANALYZE %s WITH (error = 1)", tableName), ".*'hive' does not support analyze property 'error'.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = 1)", tableName), ".*Cannot convert '1' to \\Qarray(array(varchar))\\E.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = NULL)", tableName), ".*Invalid null value for analyze property.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[NULL])", tableName), ".*Invalid null value in analyze partitions property.*"); + + // Test non-existed partition + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p4', '10']])", tableName), ".*Partition no longer exists.*"); + + // Test partition schema mismatch + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p4']])", tableName), ".*Partition value count does not match the partition column count.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p4', '10', 'error']])", tableName), ".*Partition value count does not match the partition column count.*"); + + // Drop the partitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testInvalidAnalyzeUnpartitionedTable() + { + String tableName = "test_invalid_analyze_unpartitioned_table"; + + // Test table does not exist + assertQueryFails("ANALYZE " + tableName, ".*Table.*does not exist.*"); + + createUnpartitionedTableForAnalyzeTest(tableName); + + // Test partition properties on unpartitioned table + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[])", tableName), ".*Only partitioned table can be analyzed with a partition list.*"); + assertQueryFails(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p1']])", tableName), ".*Only partitioned table can be analyzed with a partition list.*"); + + // Drop the partitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testAnalyzePartitionedTable() + { + String tableName = "test_analyze_partitioned_table"; + createPartitionedTableForAnalyzeTest(tableName); + + // No column stats before running analyze + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); + + // No column stats after running an empty analyze + assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[])", tableName), 0); + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, 
null, null, 16.0, null, null)"); + + // Run analyze on 3 partitions including a null partition and a duplicate partition + assertUpdate(format("ANALYZE %s WITH (partitions = ARRAY[ARRAY['p1', '7'], ARRAY['p2', '7'], ARRAY['p2', '7'], ARRAY[NULL, NULL]])", tableName), 12); + + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '0', '1'), " + + "('c_double', null, 2.0, 0.5, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '1', '2'), " + + "('c_double', null, 2.0, 0.5, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar IS NULL AND p_bigint IS NULL)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 1.0, 0.0, null, null, null), " + + "('c_bigint', null, 4.0, 0.0, null, '4', '7'), " + + "('c_double', null, 4.0, 0.0, null, '4.7', '7.7'), " + + "('c_timestamp', null, 4.0, 0.0, null, null, null), " + + "('c_varchar', 16.0, 4.0, 0.0, null, null, null), " + + "('c_varbinary', 8.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 1.0, null, null, null), " + + "('p_bigint', null, 0.0, 1.0, null, null, null), " + + "(null, null, null, null, 4.0, null, null)"); + + // Partition [p3, 8], [e1, 9], [e2, 9] have no column stats + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3' AND p_bigint = 8)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '8', '8'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e1' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + 
assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e2' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + + // Run analyze on the whole table + assertUpdate("ANALYZE " + tableName, 16); + + // All partitions except empty partitions have column stats + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p1' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '0', '1'), " + + "('c_double', null, 2.0, 0.5, null, '1.2', '2.2'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p2' AND p_bigint = 7)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '1', '2'), " + + "('c_double', null, 2.0, 0.5, null, '2.3', '3.3'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '7', '7'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar IS NULL AND p_bigint IS NULL)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 1.0, 0.0, null, null, null), " + + "('c_bigint', null, 4.0, 0.0, null, '4', '7'), " + + "('c_double', null, 4.0, 0.0, null, '4.7', '7.7'), " + + "('c_timestamp', null, 4.0, 0.0, null, null, null), " + + "('c_varchar', 16.0, 4.0, 0.0, null, null, null), " + + "('c_varbinary', 8.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 1.0, null, null, null), " + + "('p_bigint', null, 0.0, 1.0, null, null, null), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'p3' AND p_bigint = 8)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.5, null, null, null), " + + "('c_bigint', null, 2.0, 0.5, null, '2', '3'), " + + "('c_double', null, 2.0, 0.5, null, '3.4', '4.4'), " + + "('c_timestamp', null, 2.0, 0.5, null, null, null), " + + "('c_varchar', 8.0, 2.0, 0.5, null, null, null), " + + "('c_varbinary', 4.0, null, 0.5, null, null, null), " + + "('p_varchar', 8.0, 1.0, 0.0, null, null, null), " + + "('p_bigint', null, 1.0, 0.0, null, '8', '8'), " + + "(null, null, null, null, 4.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e1' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 0.0, 0.0, null, null, null), " + + "('c_bigint', null, 0.0, 0.0, null, 
null, null), " + + "('c_double', null, 0.0, 0.0, null, null, null), " + + "('c_timestamp', null, 0.0, 0.0, null, null, null), " + + "('c_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('c_varbinary', 0.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + assertQuery(format("SHOW STATS FOR (SELECT * FROM %s WHERE p_varchar = 'e2' AND p_bigint = 9)", tableName), + "SELECT * FROM VALUES " + + "('c_boolean', null, 0.0, 0.0, null, null, null), " + + "('c_bigint', null, 0.0, 0.0, null, null, null), " + + "('c_double', null, 0.0, 0.0, null, null, null), " + + "('c_timestamp', null, 0.0, 0.0, null, null, null), " + + "('c_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('c_varbinary', 0.0, null, 0.0, null, null, null), " + + "('p_varchar', 0.0, 0.0, 0.0, null, null, null), " + + "('p_bigint', null, 0.0, 0.0, null, null, null), " + + "(null, null, null, null, 0.0, null, null)"); + + // Drop the partitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + @Test + public void testAnalyzeUnpartitionedTable() + { + String tableName = "test_analyze_unpartitioned_table"; + createUnpartitionedTableForAnalyzeTest(tableName); + + // No column stats before running analyze + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, null, null, null, null, null), " + + "('c_bigint', null, null, null, null, null, null), " + + "('c_double', null, null, null, null, null, null), " + + "('c_timestamp', null, null, null, null, null, null), " + + "('c_varchar', null, null, null, null, null, null), " + + "('c_varbinary', null, null, null, null, null, null), " + + "('p_varchar', null, null, null, null, null, null), " + + "('p_bigint', null, null, null, null, null, null), " + + "(null, null, null, null, 16.0, null, null)"); + + // Run analyze on the whole table + assertUpdate("ANALYZE " + tableName, 16); + + assertQuery("SHOW STATS FOR " + tableName, + "SELECT * FROM VALUES " + + "('c_boolean', null, 2.0, 0.375, null, null, null), " + + "('c_bigint', null, 8.0, 0.375, null, '0', '7'), " + + "('c_double', null, 10.0, 0.375, null, '1.2', '7.7'), " + + "('c_timestamp', null, 10.0, 0.375, null, null, null), " + + "('c_varchar', 40.0, 10.0, 0.375, null, null, null), " + + "('c_varbinary', 20.0, null, 0.375, null, null, null), " + + "('p_varchar', 24.0, 3.0, 0.25, null, null, null), " + + "('p_bigint', null, 2.0, 0.25, null, '7', '8'), " + + "(null, null, null, null, 16.0, null, null)"); + + // Drop the unpartitioned test table + assertUpdate(format("DROP TABLE %s", tableName)); + } + + protected void createPartitionedTableForAnalyzeTest(String tableName) + { + createTableForAnalyzeTest(tableName, true); + } + + protected void createUnpartitionedTableForAnalyzeTest(String tableName) + { + createTableForAnalyzeTest(tableName, false); + } + + private void createTableForAnalyzeTest(String tableName, boolean partitioned) + { + Session defaultSession = getSession(); + + // Disable column statistics collection when creating the table + Session disableColumnStatsSession = Session.builder(defaultSession) + .setCatalogSessionProperty(defaultSession.getCatalog().get(), "collect_column_statistics_on_write", "false") + .build(); + + assertUpdate( + disableColumnStatsSession, + "" + + "CREATE TABLE " + + tableName + + (partitioned ? 
" WITH (partitioned_by = ARRAY['p_varchar', 'p_bigint'])\n" : " ") + + "AS " + + "SELECT c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar, p_bigint " + + "FROM ( " + + " VALUES " + + // p_varchar = 'p1', p_bigint = BIGINT '7' + " (null, null, null, null, null, null, 'p1', BIGINT '7'), " + + " (null, null, null, null, null, null, 'p1', BIGINT '7'), " + + " (true, BIGINT '1', DOUBLE '2.2', TIMESTAMP '2012-08-08 01:00', 'abc1', X'bcd1', 'p1', BIGINT '7'), " + + " (false, BIGINT '0', DOUBLE '1.2', TIMESTAMP '2012-08-08 00:00', 'abc2', X'bcd2', 'p1', BIGINT '7'), " + + // p_varchar = 'p2', p_bigint = BIGINT '7' + " (null, null, null, null, null, null, 'p2', BIGINT '7'), " + + " (null, null, null, null, null, null, 'p2', BIGINT '7'), " + + " (true, BIGINT '2', DOUBLE '3.3', TIMESTAMP '2012-09-09 01:00', 'cba1', X'dcb1', 'p2', BIGINT '7'), " + + " (false, BIGINT '1', DOUBLE '2.3', TIMESTAMP '2012-09-09 00:00', 'cba2', X'dcb2', 'p2', BIGINT '7'), " + + // p_varchar = 'p3', p_bigint = BIGINT '8' + " (null, null, null, null, null, null, 'p3', BIGINT '8'), " + + " (null, null, null, null, null, null, 'p3', BIGINT '8'), " + + " (true, BIGINT '3', DOUBLE '4.4', TIMESTAMP '2012-10-10 01:00', 'bca1', X'cdb1', 'p3', BIGINT '8'), " + + " (false, BIGINT '2', DOUBLE '3.4', TIMESTAMP '2012-10-10 00:00', 'bca2', X'cdb2', 'p3', BIGINT '8'), " + + // p_varchar = NULL, p_bigint = NULL + " (false, BIGINT '7', DOUBLE '7.7', TIMESTAMP '1977-07-07 07:07', 'efa1', X'efa1', NULL, NULL), " + + " (false, BIGINT '6', DOUBLE '6.7', TIMESTAMP '1977-07-07 07:06', 'efa2', X'efa2', NULL, NULL), " + + " (false, BIGINT '5', DOUBLE '5.7', TIMESTAMP '1977-07-07 07:05', 'efa3', X'efa3', NULL, NULL), " + + " (false, BIGINT '4', DOUBLE '4.7', TIMESTAMP '1977-07-07 07:04', 'efa4', X'efa4', NULL, NULL) " + + ") AS x (c_boolean, c_bigint, c_double, c_timestamp, c_varchar, c_varbinary, p_varchar, p_bigint)", 16); + + if (partitioned) { + // Create empty partitions + assertUpdate(disableColumnStatsSession, format("CALL system.create_empty_partition('%s', '%s', ARRAY['p_varchar', 'p_bigint'], ARRAY['%s', '%s'])", TPCH_SCHEMA, tableName, "e1", "9")); + assertUpdate(disableColumnStatsSession, format("CALL system.create_empty_partition('%s', '%s', ARRAY['p_varchar', 'p_bigint'], ARRAY['%s', '%s'])", TPCH_SCHEMA, tableName, "e2", "9")); + } + } + @Test public void testInsertMultipleColumnsFromSameChannel() { diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java index 19d982270d388..aac3355d38f0d 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestExternalHiveTable.java @@ -25,10 +25,12 @@ import static com.facebook.presto.tests.utils.QueryExecutors.onHive; import static com.facebook.presto.tests.utils.QueryExecutors.onPresto; import static io.prestodb.tempto.Requirements.compose; +import static io.prestodb.tempto.assertions.QueryAssert.Row.row; import static io.prestodb.tempto.assertions.QueryAssert.assertThat; import static io.prestodb.tempto.fulfillment.table.MutableTablesState.mutableTablesState; import static io.prestodb.tempto.fulfillment.table.TableRequirements.mutableTable; import static io.prestodb.tempto.fulfillment.table.hive.tpch.TpchTableDefinitions.NATION; +import static io.prestodb.tempto.query.QueryExecutor.query; public class 
TestExternalHiveTable extends ProductTest @@ -43,6 +45,43 @@ public Requirement getRequirements(Configuration configuration) mutableTable(NATION_PARTITIONED_BY_BIGINT_REGIONKEY)); } + @Test + public void testShowStatisticsForExternalTable() + { + TableInstance nation = mutableTablesState().get(NATION_PARTITIONED_BY_BIGINT_REGIONKEY.getName()); + onHive().executeQuery("DROP TABLE IF EXISTS " + EXTERNAL_TABLE_NAME); + onHive().executeQuery("CREATE EXTERNAL TABLE " + EXTERNAL_TABLE_NAME + " LIKE " + nation.getNameInDatabase() + " LOCATION '/tmp/" + EXTERNAL_TABLE_NAME + "_" + nation.getNameInDatabase() + "'"); + insertNationPartition(nation, 1); + + onHive().executeQuery("ANALYZE TABLE " + EXTERNAL_TABLE_NAME + " PARTITION (p_regionkey) COMPUTE STATISTICS"); + assertThat(query("SHOW STATS FOR " + EXTERNAL_TABLE_NAME)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row(null, null, null, null, 5.0, null, null)); + + onHive().executeQuery("ANALYZE TABLE " + EXTERNAL_TABLE_NAME + " PARTITION (p_regionkey) COMPUTE STATISTICS FOR COLUMNS"); + assertThat(query("SHOW STATS FOR " + EXTERNAL_TABLE_NAME)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row(null, null, null, null, 5.0, null, null)); + } + + @Test + public void testAnalyzeExternalTable() + { + TableInstance nation = mutableTablesState().get(NATION_PARTITIONED_BY_BIGINT_REGIONKEY.getName()); + onHive().executeQuery("DROP TABLE IF EXISTS " + EXTERNAL_TABLE_NAME); + onHive().executeQuery("CREATE EXTERNAL TABLE " + EXTERNAL_TABLE_NAME + " LIKE " + nation.getNameInDatabase() + " LOCATION '/tmp/" + EXTERNAL_TABLE_NAME + "_" + nation.getNameInDatabase() + "'"); + insertNationPartition(nation, 1); + + // Running ANALYZE on an external table is allowed as long as the user has the privileges. + assertThat(query("ANALYZE hive.default." 
+ EXTERNAL_TABLE_NAME)).containsExactly(row(5)); + } + @Test public void testInsertIntoExternalTable() { diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveBasicTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveBasicTableStatistics.java index 4a5b78b14afc8..e9f033422e15b 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveBasicTableStatistics.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveBasicTableStatistics.java @@ -144,6 +144,80 @@ public void testCreatePartitioned() } } + @Test(groups = {HIVE_TABLE_STATISTICS}) + public void testAnalyzePartitioned() + { + String tableName = "test_basic_statistics_analyze_partitioned"; + + onPresto().executeQuery("DROP TABLE IF EXISTS " + tableName); + onPresto().executeQuery(format("" + + "CREATE TABLE %s " + + "WITH ( " + + " partitioned_by = ARRAY['n_regionkey'], " + + " bucketed_by = ARRAY['n_nationkey'], " + + " bucket_count = 10 " + + ") " + + "AS " + + "SELECT n_nationkey, n_name, n_comment, n_regionkey " + + "FROM nation " + + "WHERE n_regionkey = 1", tableName)); + + try { + BasicStatistics tableStatistics = getBasicStatisticsForTable(onHive(), tableName); + assertThatStatisticsAreNotPresent(tableStatistics); + + BasicStatistics partitionStatisticsBefore = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=1"); + assertThatStatisticsArePresent(partitionStatisticsBefore); + + // run ANALYZE + onPresto().executeQuery(format("ANALYZE %s", tableName)); + BasicStatistics partitionStatisticsAfter = getBasicStatisticsForPartition(onHive(), tableName, "n_regionkey=1"); + assertThatStatisticsArePresent(partitionStatisticsAfter); + + // ANALYZE must not change the basic stats + assertThat(partitionStatisticsBefore.getNumRows().getAsLong()).isEqualTo(partitionStatisticsAfter.getNumRows().getAsLong()); + assertThat(partitionStatisticsBefore.getNumFiles().getAsLong()).isEqualTo(partitionStatisticsAfter.getNumFiles().getAsLong()); + assertThat(partitionStatisticsBefore.getRawDataSize().getAsLong()).isEqualTo(partitionStatisticsAfter.getRawDataSize().getAsLong()); + assertThat(partitionStatisticsBefore.getTotalSize().getAsLong()).isEqualTo(partitionStatisticsAfter.getTotalSize().getAsLong()); + } + finally { + onPresto().executeQuery(format("DROP TABLE %s", tableName)); + } + } + + @Test(groups = {HIVE_TABLE_STATISTICS}) + public void testAnalyzeUnpartitioned() + { + String tableName = "test_basic_statistics_analyze_unpartitioned"; + + onPresto().executeQuery("DROP TABLE IF EXISTS " + tableName); + onPresto().executeQuery(format("" + + "CREATE TABLE %s " + + "AS " + + "SELECT n_nationkey, n_name, n_comment, n_regionkey " + + "FROM nation " + + "WHERE n_regionkey = 1", tableName)); + + try { + BasicStatistics tableStatisticsBefore = getBasicStatisticsForTable(onHive(), tableName); + assertThatStatisticsArePresent(tableStatisticsBefore); + + // run ANALYZE + onPresto().executeQuery(format("ANALYZE %s", tableName)); + BasicStatistics tableStatisticsAfter = getBasicStatisticsForTable(onHive(), tableName); + assertThatStatisticsArePresent(tableStatisticsAfter); + + // ANALYZE must not change the basic stats + assertThat(tableStatisticsBefore.getNumRows()).isEqualTo(tableStatisticsAfter.getNumRows()); + assertThat(tableStatisticsBefore.getNumFiles()).isEqualTo(tableStatisticsAfter.getNumFiles()); + assertThat(tableStatisticsBefore.getRawDataSize()).isEqualTo(tableStatisticsAfter.getRawDataSize()); 
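+ // the total on-disk size must also be unchanged: ANALYZE collects column-level statistics and does not rewrite any table data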
+ assertThat(tableStatisticsBefore.getTotalSize()).isEqualTo(tableStatisticsAfter.getTotalSize()); + } + finally { + onPresto().executeQuery(format("DROP TABLE %s", tableName)); + } + } + @Test(groups = {HIVE_TABLE_STATISTICS}) public void testInsertPartitioned() { diff --git a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java index 17cd7711eff9f..bc107b40cf608 100644 --- a/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java +++ b/presto-product-tests/src/main/java/com/facebook/presto/tests/hive/TestHiveTableStatistics.java @@ -606,6 +606,375 @@ public void testStatisticsForAllDataTypesOnlyNulls() row(null, null, null, null, 1.0, null, null)); } + @Test + @Requires(UnpartitionedNationTable.class) + public void testStatisticsForSkewedTable() + { + String tableName = "test_hive_skewed_table_statistics"; + onHive().executeQuery("DROP TABLE IF EXISTS " + tableName); + onHive().executeQuery("CREATE TABLE " + tableName + " (c_string STRING, c_int INT) SKEWED BY (c_string) ON ('c1')"); + onHive().executeQuery("INSERT INTO TABLE " + tableName + " VALUES ('c1', 1), ('c1', 2)"); + + assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( + row("c_string", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); + + onHive().executeQuery("ANALYZE TABLE " + tableName + " COMPUTE STATISTICS"); + + assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( + row("c_string", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); + + onHive().executeQuery("ANALYZE TABLE " + tableName + " COMPUTE STATISTICS FOR COLUMNS"); + assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( + row("c_string", 4.0, 1.0, 0.0, null, null, null), + row("c_int", null, 2.0, 0.0, null, "1", "2"), + row(null, null, null, null, 2.0, null, null)); + } + + @Test + @Requires(UnpartitionedNationTable.class) + public void testAnalyzesForSkewedTable() + { + String tableName = "test_analyze_skewed_table"; + onHive().executeQuery("DROP TABLE IF EXISTS " + tableName); + onHive().executeQuery("CREATE TABLE " + tableName + " (c_string STRING, c_int INT) SKEWED BY (c_string) ON ('c1')"); + onHive().executeQuery("INSERT INTO TABLE " + tableName + " VALUES ('c1', 1), ('c1', 2)"); + + assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( + row("c_string", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row(null, null, null, null, 2.0, null, null)); + + assertThat(query("ANALYZE " + tableName)).containsExactly(row(2)); + assertThat(query("SHOW STATS FOR " + tableName)).containsOnly( + row("c_string", 4.0, 1.0, 0.0, null, null, null), + row("c_int", null, 2.0, 0.0, null, "1", "2"), + row(null, null, null, null, 2.0, null, null)); + } + + @Test + @Requires(UnpartitionedNationTable.class) + public void testAnalyzeForUnpartitionedTable() + { + String tableNameInDatabase = mutableTablesState().get(NATION.getName()).getNameInDatabase(); + + String showStatsWholeTable = "SHOW STATS FOR " + tableNameInDatabase; + + // table not analyzed + assertThat(query(showStatsWholeTable)).containsOnly( + row("n_nationkey", null, null, anyOf(null, 0.0), null, null, null), + row("n_name", null, null, anyOf(null, 0.0), null, null, null), + 
row("n_regionkey", null, null, anyOf(null, 0.0), null, null, null), + row("n_comment", null, null, anyOf(null, 0.0), null, null, null), + row(null, null, null, null, anyOf(null, 0.0), null, null)); // anyOf because of different behaviour on HDP (hive 1.2) and CDH (hive 1.1) + + assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(25)); + + assertThat(query(showStatsWholeTable)).containsOnly( + row("n_nationkey", null, 25.0, 0.0, null, "0", "24"), + row("n_name", 177.0, 25.0, 0.0, null, null, null), + row("n_regionkey", null, 5.0, 0.0, null, "0", "4"), + row("n_comment", 1857.0, 25.0, 0.0, null, null, null), + row(null, null, null, null, 25.0, null, null)); + } + + @Test + @Requires(NationPartitionedByBigintTable.class) + public void testAnalyzeForTablePartitionedByBigint() + { + String tableNameInDatabase = mutableTablesState().get(NATION_PARTITIONED_BY_BIGINT_REGIONKEY.getName()).getNameInDatabase(); + + String showStatsWholeTable = "SHOW STATS FOR " + tableNameInDatabase; + String showStatsPartitionOne = "SHOW STATS FOR (SELECT * FROM " + tableNameInDatabase + " WHERE p_regionkey = 1)"; + String showStatsPartitionTwo = "SHOW STATS FOR (SELECT * FROM " + tableNameInDatabase + " WHERE p_regionkey = 2)"; + + // table not analyzed + + assertThat(query(showStatsWholeTable)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); + + assertThat(query(showStatsPartitionOne)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); + + // analyze for single partition + + assertThat(query("ANALYZE " + tableNameInDatabase + " WITH (partitions = ARRAY[ARRAY['1']])")).containsExactly(row(5)); + + assertThat(query(showStatsWholeTable)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 114.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); + + assertThat(query(showStatsPartitionOne)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); + + assertThat(query(showStatsPartitionTwo)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); + + // analyze for all partitions + + assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(15)); + + assertThat(query(showStatsWholeTable)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 109.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 3.0, 0.0, null, "1", "3"), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); + + 
assertThat(query(showStatsPartitionOne)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "1", "1"), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); + + assertThat(query(showStatsPartitionTwo)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "8", "21"), + row("p_name", 31.0, 5.0, 0.0, null, null, null), + row("p_regionkey", null, 1.0, 0.0, null, "2", "2"), + row("p_comment", 351.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); + } + + @Test + @Requires(NationPartitionedByVarcharTable.class) + public void testAnalyzeForTablePartitionedByVarchar() + { + String tableNameInDatabase = mutableTablesState().get(NATION_PARTITIONED_BY_VARCHAR_REGIONKEY.getName()).getNameInDatabase(); + + String showStatsWholeTable = "SHOW STATS FOR " + tableNameInDatabase; + String showStatsPartitionOne = "SHOW STATS FOR (SELECT * FROM " + tableNameInDatabase + " WHERE p_regionkey = 'AMERICA')"; + String showStatsPartitionTwo = "SHOW STATS FOR (SELECT * FROM " + tableNameInDatabase + " WHERE p_regionkey = 'ASIA')"; + + // table not analyzed + + assertThat(query(showStatsWholeTable)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); + + assertThat(query(showStatsPartitionOne)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); + + // analyze for single partition + + assertThat(query("ANALYZE " + tableNameInDatabase + " WITH (partitions = ARRAY[ARRAY['AMERICA']])")).containsExactly(row(5)); + + assertThat(query(showStatsWholeTable)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 114.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", 1497.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); + + assertThat(query(showStatsPartitionOne)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); + + assertThat(query(showStatsPartitionTwo)).containsOnly( + row("p_nationkey", null, null, null, null, null, null), + row("p_name", null, null, null, null, null, null), + row("p_regionkey", null, null, null, null, null, null), + row("p_comment", null, null, null, null, null, null), + row(null, null, null, null, null, null, null)); + + // column analysis for all partitions + + assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(15)); + + assertThat(query(showStatsWholeTable)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 109.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 85.0, 3.0, 0.0, null, null, null), + row("p_comment", 1197.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 15.0, null, null)); + + 
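// Unlike the bigint variant above, the varchar partition column p_regionkey reports a data size (85.0 across the three partitions) and no low/high values. + 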
assertThat(query(showStatsPartitionOne)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "1", "24"), + row("p_name", 38.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 35.0, 1.0, 0.0, null, null, null), + row("p_comment", 499.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); + + assertThat(query(showStatsPartitionTwo)).containsOnly( + row("p_nationkey", null, 5.0, 0.0, null, "8", "21"), + row("p_name", 31.0, 5.0, 0.0, null, null, null), + row("p_regionkey", 20.0, 1.0, 0.0, null, null, null), + row("p_comment", 351.0, 5.0, 0.0, null, null, null), + row(null, null, null, null, 5.0, null, null)); + } + + // This also covers stats calculation for an unpartitioned table + @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats + @Requires(AllTypesTable.class) + public void testAnalyzeForAllDataTypes() + { + String tableNameInDatabase = mutableTablesState().get(ALL_TYPES_TABLE_NAME).getNameInDatabase(); + + assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 0.0, null, null)); + + assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(2)); + + // SHOW STATS FORMAT: column_name, data_size, distinct_values_count, nulls_fraction, row_count, low_value, high_value + assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( + row("c_tinyint", null, 2.0, 0.0, null, "121", "127"), + row("c_smallint", null, 2.0, 0.0, null, "32761", "32767"), + row("c_int", null, 2.0, 0.0, null, "2147483641", "2147483647"), + row("c_bigint", null, 2.0, 0.0, null, "9223372036854775801", "9223372036854775807"), + row("c_float", null, 2.0, 0.0, null, "123.341", "123.345"), + row("c_double", null, 2.0, 0.0, null, "234.561", "234.567"), + row("c_decimal", null, 2.0, 0.0, null, "345.0", "346.0"), + row("c_decimal_w_params", null, 2.0, 0.0, null, "345.671", "345.678"), + row("c_timestamp", null, 2.0, 0.0, null, null, null), + row("c_date", null, 2.0, 0.0, null, "2015-05-09", "2015-06-10"), + row("c_string", 22.0, 2.0, 0.0, null, null, null), + row("c_varchar", 20.0, 2.0, 0.0, null, null, null), + row("c_char", 12.0, 2.0, 0.0, null, null, null), + row("c_boolean", null, 2.0, 0.0, null, null, null), + row("c_binary", 23.0, null, 0.0, null, null, null), + row(null, null, null, null, 2.0, null, null)); + } + + @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats + @Requires(AllTypesTable.class) + public void testAnalyzeForAllDataTypesNoData() + { + String tableNameInDatabase = mutableTablesState().get(EMPTY_ALL_TYPES_TABLE_NAME).getNameInDatabase(); + + assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( + row("c_tinyint", 
null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 0.0, null, null)); + + assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(0)); + + assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( + row("c_tinyint", null, 0.0, 0.0, null, null, null), + row("c_smallint", null, 0.0, 0.0, null, null, null), + row("c_int", null, 0.0, 0.0, null, null, null), + row("c_bigint", null, 0.0, 0.0, null, null, null), + row("c_float", null, 0.0, 0.0, null, null, null), + row("c_double", null, 0.0, 0.0, null, null, null), + row("c_decimal", null, 0.0, 0.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 0.0, null, null, null), + row("c_timestamp", null, 0.0, 0.0, null, null, null), + row("c_date", null, 0.0, 0.0, null, null, null), + row("c_string", 0.0, 0.0, 0.0, null, null, null), + row("c_varchar", 0.0, 0.0, 0.0, null, null, null), + row("c_char", 0.0, 0.0, 0.0, null, null, null), + row("c_boolean", null, 0.0, 0.0, null, null, null), + row("c_binary", 0.0, null, 0.0, null, null, null), + row(null, null, null, null, 0.0, null, null)); + } + + @Test(groups = {SKIP_ON_CDH}) // skip on cdh due to no support for date column and stats + @Requires(AllTypesTable.class) + public void testAnalyzeForAllDataTypesOnlyNulls() + { + String tableNameInDatabase = mutableTablesState().get(EMPTY_ALL_TYPES_TABLE_NAME).getNameInDatabase(); + + // insert via Hive to prevent Presto from collecting statistics on insert + onHive().executeQuery("INSERT INTO TABLE " + tableNameInDatabase + " VALUES(null, null, null, null, null, null, null, null, null, null, null, null, null, null, null)"); + + assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( + row("c_tinyint", null, null, null, null, null, null), + row("c_smallint", null, null, null, null, null, null), + row("c_int", null, null, null, null, null, null), + row("c_bigint", null, null, null, null, null, null), + row("c_float", null, null, null, null, null, null), + row("c_double", null, null, null, null, null, null), + row("c_decimal", null, null, null, null, null, null), + row("c_decimal_w_params", null, null, null, null, null, null), + row("c_timestamp", null, null, null, null, null, null), + row("c_date", null, null, null, null, null, null), + row("c_string", null, null, null, null, null, null), + row("c_varchar", null, null, null, null, null, null), + row("c_char", null, null, null, null, null, null), + row("c_boolean", null, null, null, null, null, null), + row("c_binary", null, null, null, null, null, null), + row(null, null, null, null, 1.0, null, null)); + + assertThat(query("ANALYZE " + tableNameInDatabase)).containsExactly(row(1)); + + assertThat(query("SHOW STATS FOR " + tableNameInDatabase)).containsOnly( + row("c_tinyint", 
null, 0.0, 1.0, null, null, null), + row("c_smallint", null, 0.0, 1.0, null, null, null), + row("c_int", null, 0.0, 1.0, null, null, null), + row("c_bigint", null, 0.0, 1.0, null, null, null), + row("c_float", null, 0.0, 1.0, null, null, null), + row("c_double", null, 0.0, 1.0, null, null, null), + row("c_decimal", null, 0.0, 1.0, null, null, null), + row("c_decimal_w_params", null, 0.0, 1.0, null, null, null), + row("c_timestamp", null, 0.0, 1.0, null, null, null), + row("c_date", null, 0.0, 1.0, null, null, null), + row("c_string", 0.0, 0.0, 1.0, null, null, null), + row("c_varchar", 0.0, 0.0, 1.0, null, null, null), + row("c_char", 0.0, 0.0, 1.0, null, null, null), + row("c_boolean", null, 0.0, 1.0, null, null, null), + row("c_binary", 0.0, null, 1.0, null, null, null), + row(null, null, null, null, 1.0, null, null)); + } + @Test @Requires(AllTypesTable.class) public void testComputeTableStatisticsOnCreateTable() diff --git a/presto-tests/src/test/java/com/facebook/presto/tests/TestTpchDistributedQueries.java b/presto-tests/src/test/java/com/facebook/presto/tests/TestTpchDistributedQueries.java index 85d07b506557c..4c75e04741a80 100644 --- a/presto-tests/src/test/java/com/facebook/presto/tests/TestTpchDistributedQueries.java +++ b/presto-tests/src/test/java/com/facebook/presto/tests/TestTpchDistributedQueries.java @@ -36,6 +36,13 @@ public void testTooLongQuery() assertQueryFails(longQuery, "Query text length \\(1000037\\) exceeds the maximum length \\(1000000\\)"); } + @Test + public void testAnalyze() + { + assertUpdate("ANALYZE orders", 15000); + assertQueryFails("ANALYZE orders WITH (foo = 'bar')", ".* does not support analyze property 'foo'.*"); + } + @Test public void testTooManyStages() {