Skip to content

Commit

Permalink
[HOPSWORKS-2280] Save statistics configuration also for training data…
Browse files Browse the repository at this point in the history
…sets (#217)
  • Loading branch information
moritzmeister committed Jan 20, 2021
1 parent 9302c27 commit 4c48a3b
Show file tree
Hide file tree
Showing 22 changed files with 344 additions and 127 deletions.
12 changes: 12 additions & 0 deletions auto_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,12 @@
"hsfs.constructor.query.Query"
),
},
"statistics.md": {
"statistics_config": ["hsfs.statistics_config.StatisticsConfig"],
"statistics_config_properties": keras_autodoc.get_properties(
"hsfs.statistics_config.StatisticsConfig"
),
},
"api/connection_api.md": {
"connection": ["hsfs.connection.Connection"],
"connection_properties": keras_autodoc.get_properties(
Expand Down Expand Up @@ -130,6 +136,12 @@
"hsfs.storage_connector.StorageConnector"
),
},
"api/statistics_config_api.md": {
"statistics_config": ["hsfs.statistics_config.StatisticsConfig"],
"statistics_config_properties": keras_autodoc.get_properties(
"hsfs.statistics_config.StatisticsConfig"
),
},
}

hsfs_dir = pathlib.Path(__file__).resolve().parents[0]
Expand Down
7 changes: 7 additions & 0 deletions docs/templates/api/statistics_config_api.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# StatisticsConfig

{{statistics_config}}

## Properties

{{statistics_config_properties}}
44 changes: 44 additions & 0 deletions docs/templates/statistics.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Statistics

HSFS provides functionality to compute statistics for [training datasets](training_dataset.md) and [feature groups](feature_group.md) and save these along with their other metadata in the [feature store](feature_store.md).
These statistics are meant to be helpful for Data Scientists to perform explorative data analysis and then recognize suitable [features](feature.md) or [training datasets](training_dataset.md) for models.

Statistics are configured on a training dataset or feature group level using a `StatisticsConfig` object.
This object can be passed at creation time of the dataset or group or it can later on be updated through the API.

{{statistics_config}}

For example, to enable all statistics (descriptive, histograms and correlations) for a training dataset:

=== "Python"
```python
from hsfs.statistics_config import StatisticsConfig

td = fs.create_training_dataset("rain_dataset",
version=1,
label=”weekly_rain”,
data_format=”tfrecords”,
statistics_config=StatisticsConfig(true, true, true))

```
=== "Scala"
```scala
val td = (fs.createTrainingDataset()
.name("rain_dataset")
.version(1)
.label(”weekly_rain”)
.dataFormat(”tfrecords”)
.statisticsConfig(new StatisticsConfig(true, true, true))
.build())
```

And similarly for feature groups.

!!! note "Default StatisticsConfig"
By default all training datasets and feature groups will be configured such that only descriptive statistics
are computed. However, you can also enable `histograms` and `correlations` or limit the features for which
statistics are computed.

## Properties

{{statistics_config_properties}}
10 changes: 3 additions & 7 deletions java/src/main/java/com/logicalclocks/hsfs/FeatureGroup.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,7 @@ public class FeatureGroup extends FeatureGroupBase {
public FeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String description,
List<String> primaryKeys, List<String> partitionKeys, String hudiPrecombineKey,
boolean onlineEnabled, TimeTravelFormat timeTravelFormat, List<Feature> features,
Boolean statisticsEnabled, Boolean histograms, Boolean correlations,
List<String> statisticColumns) {
StatisticsConfig statisticsConfig) {
this.featureStore = featureStore;
this.name = name;
this.version = version;
Expand All @@ -87,10 +86,7 @@ public FeatureGroup(FeatureStore featureStore, @NonNull String name, Integer ver
this.onlineEnabled = onlineEnabled;
this.timeTravelFormat = timeTravelFormat != null ? timeTravelFormat : TimeTravelFormat.HUDI;
this.features = features;
this.statisticsEnabled = statisticsEnabled != null ? statisticsEnabled : true;
this.histograms = histograms;
this.correlations = correlations;
this.statisticColumns = statisticColumns;
this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig();
}

public FeatureGroup() {
Expand Down Expand Up @@ -183,7 +179,7 @@ public void save(Dataset<Row> featureData, Map<String, String> writeOptions)
throws FeatureStoreException, IOException {
featureGroupEngine.saveFeatureGroup(this, featureData, primaryKeys, partitionKeys, hudiPrecombineKey,
writeOptions);
if (statisticsEnabled) {
if (statisticsConfig.getEnabled()) {
statisticsEngine.computeStatistics(this, featureData);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ public class OnDemandFeatureGroup extends FeatureGroupBase {
public OnDemandFeatureGroup(FeatureStore featureStore, @NonNull String name, Integer version, String query,
OnDemandDataFormat dataFormat, String path, Map<String, String> options,
@NonNull StorageConnector storageConnector, String description, List<Feature> features,
Boolean statisticsEnabled, Boolean histograms, Boolean correlations,
List<String> statisticColumns) {
StatisticsConfig statisticsConfig) {
this.featureStore = featureStore;
this.name = name;
this.version = version;
Expand All @@ -83,10 +82,7 @@ public OnDemandFeatureGroup(FeatureStore featureStore, @NonNull String name, Int
this.description = description;
this.storageConnector = storageConnector;
this.features = features;
this.statisticsEnabled = statisticsEnabled != null ? statisticsEnabled : true;
this.histograms = histograms;
this.correlations = correlations;
this.statisticColumns = statisticColumns;
this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig();
}

public OnDemandFeatureGroup() {
Expand All @@ -95,7 +91,7 @@ public OnDemandFeatureGroup() {
public void save() throws FeatureStoreException, IOException {
onDemandFeatureGroupEngine.saveFeatureGroup(this);

if (statisticsEnabled) {
if (statisticsConfig.getEnabled()) {
statisticsEngine.computeStatistics(this, read());
}
}
Expand Down
53 changes: 53 additions & 0 deletions java/src/main/java/com/logicalclocks/hsfs/StatisticsConfig.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2021 Logical Clocks AB
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* See the License for the specific language governing permissions and limitations under the License.
*/

package com.logicalclocks.hsfs;

import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;

import java.util.ArrayList;
import java.util.List;

@AllArgsConstructor
@NoArgsConstructor
@Builder
public class StatisticsConfig {
@Getter
@Setter
private Boolean enabled = true;

@Getter
@Setter
private Boolean histograms = false;

@Getter
@Setter
private Boolean correlations = false;

@Getter
@Setter
private List<String> columns = new ArrayList<>();

public StatisticsConfig(Boolean enabled, Boolean histograms, Boolean correlations) {
this.enabled = enabled;
this.histograms = histograms;
this.correlations = correlations;
}
}
42 changes: 17 additions & 25 deletions java/src/main/java/com/logicalclocks/hsfs/TrainingDataset.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,23 +90,7 @@ public class TrainingDataset {

@Getter
@Setter
@JsonIgnore
private Boolean statisticsEnabled = true;

@Getter
@Setter
@JsonIgnore
private Boolean histograms;

@Getter
@Setter
@JsonIgnore
private Boolean correlations;

@Getter
@Setter
@JsonIgnore
private List<String> statisticColumns;
private StatisticsConfig statisticsConfig = new StatisticsConfig();

@Getter
@Setter
Expand All @@ -123,8 +107,7 @@ public class TrainingDataset {
@Builder
public TrainingDataset(@NonNull String name, Integer version, String description, DataFormat dataFormat,
StorageConnector storageConnector, String location, List<Split> splits, Long seed,
FeatureStore featureStore, Boolean statisticsEnabled, Boolean histograms,
Boolean correlations, List<String> statisticColumns, List<String> label) {
FeatureStore featureStore, StatisticsConfig statisticsConfig, List<String> label) {
this.name = name;
this.version = version;
this.description = description;
Expand All @@ -142,10 +125,7 @@ public TrainingDataset(@NonNull String name, Integer version, String description
this.splits = splits;
this.seed = seed;
this.featureStore = featureStore;
this.statisticsEnabled = statisticsEnabled != null ? statisticsEnabled : true;
this.histograms = histograms;
this.correlations = correlations;
this.statisticColumns = statisticColumns;
this.statisticsConfig = statisticsConfig != null ? statisticsConfig : new StatisticsConfig();
this.label = label;
}

Expand Down Expand Up @@ -195,7 +175,7 @@ public void save(Query query, Map<String, String> writeOptions) throws FeatureSt
public void save(Dataset<Row> dataset, Map<String, String> writeOptions)
throws FeatureStoreException, IOException {
trainingDatasetEngine.save(this, dataset, writeOptions, label);
if (statisticsEnabled) {
if (statisticsConfig.getEnabled()) {
statisticsEngine.computeStatistics(this, dataset);
}
}
Expand Down Expand Up @@ -314,12 +294,24 @@ public void show(int numRows) {
* @throws IOException
*/
public Statistics computeStatistics() throws FeatureStoreException, IOException {
if (statisticsEnabled) {
if (statisticsConfig.getEnabled()) {
return statisticsEngine.computeStatistics(this, read());
}
return null;
}

/**
* Update the statistics configuration of the training dataset.
* Change the `enabled`, `histograms`, `correlations` or `columns` attributes and persist
* the changes by calling this method.
*
* @throws FeatureStoreException
* @throws IOException
*/
public void updateStatisticsConfig() throws FeatureStoreException, IOException {
trainingDatasetEngine.updateStatisticsConfig(this);
}

/**
* Get the last statistics commit for the training dataset.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,10 +66,10 @@ public void appendFeatures(FeatureGroupBase featureGroup, List<Feature> features
FeatureGroup apiFG = featureGroupApi.updateMetadata(fgBaseSend, "updateMetadata");
featureGroup.setFeatures(apiFG.getFeatures());
}

public void updateStatisticsConfig(FeatureGroupBase featureGroup) throws FeatureStoreException, IOException {
FeatureGroup apiFG = featureGroupApi.updateMetadata(featureGroup, "updateStatsSettings");
featureGroup.setCorrelations(apiFG.getCorrelations());
featureGroup.setHistograms(apiFG.getHistograms());
public void updateStatisticsConfig(FeatureGroup featureGroup) throws FeatureStoreException, IOException {
FeatureGroup apiFG = featureGroupApi.updateMetadata(featureGroup, "updateStatsConfig");
featureGroup.getStatisticsConfig().setCorrelations(apiFG.getStatisticsConfig().getCorrelations());
featureGroup.getStatisticsConfig().setHistograms(apiFG.getStatisticsConfig().getHistograms());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -109,8 +109,7 @@ public void saveFeatureGroup(FeatureGroup featureGroup, Dataset<Row> dataset, Li
featureGroup.setVersion(apiFG.getVersion());
featureGroup.setLocation(apiFG.getLocation());
featureGroup.setId(apiFG.getId());
featureGroup.setCorrelations(apiFG.getCorrelations());
featureGroup.setHistograms(apiFG.getHistograms());
featureGroup.setStatisticsConfig(apiFG.getStatisticsConfig());

/* if hudi precombine key was not provided and TimeTravelFormat is HUDI, retrieve from backend and set */
if (featureGroup.getTimeTravelFormat() == TimeTravelFormat.HUDI & hudiPrecombineKey == null) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,14 +44,17 @@ public StatisticsEngine(EntityEndpointType entityType) {

public Statistics computeStatistics(TrainingDataset trainingDataset, Dataset<Row> dataFrame)
throws FeatureStoreException, IOException {
return statisticsApi.post(trainingDataset, computeStatistics(dataFrame, trainingDataset.getStatisticColumns(),
trainingDataset.getHistograms(), trainingDataset.getCorrelations()));
return statisticsApi.post(trainingDataset, computeStatistics(dataFrame,
trainingDataset.getStatisticsConfig().getColumns(),
trainingDataset.getStatisticsConfig().getHistograms(),
trainingDataset.getStatisticsConfig().getCorrelations()));
}

public Statistics computeStatistics(FeatureGroupBase featureGroup, Dataset<Row> dataFrame)
throws FeatureStoreException, IOException {
return statisticsApi.post(featureGroup, computeStatistics(dataFrame, featureGroup.getStatisticColumns(),
featureGroup.getHistograms(), featureGroup.getCorrelations()));
return statisticsApi.post(featureGroup, computeStatistics(dataFrame,
featureGroup.getStatisticsConfig().getColumns(),
featureGroup.getStatisticsConfig().getHistograms(), featureGroup.getStatisticsConfig().getCorrelations()));
}

private Statistics computeStatistics(Dataset<Row> dataFrame, List<String> statisticColumns, Boolean histograms,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,10 @@ public String getQuery(TrainingDataset trainingDataset, Storage storage, boolean
throws FeatureStoreException, IOException {
return trainingDatasetApi.getQuery(trainingDataset, withLabel).getStorageQuery(storage);
}

public void updateStatisticsConfig(TrainingDataset trainingDataset) throws FeatureStoreException, IOException {
TrainingDataset apiTD = trainingDatasetApi.updateMetadata(trainingDataset, "updateStatsConfig");
trainingDataset.getStatisticsConfig().setCorrelations(apiTD.getStatisticsConfig().getCorrelations());
trainingDataset.getStatisticsConfig().setHistograms(apiTD.getStatisticsConfig().getHistograms());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class FeatureGroupApi {

public static final String FEATURE_GROUP_ROOT_PATH = "/featuregroups";
public static final String FEATURE_GROUP_PATH = FEATURE_GROUP_ROOT_PATH + "{/fgName}{?version}";
public static final String FEATURE_GROUP_ID_PATH = FEATURE_GROUP_ROOT_PATH + "{/fgId}{?updateStatsSettings,"
public static final String FEATURE_GROUP_ID_PATH = FEATURE_GROUP_ROOT_PATH + "{/fgId}{?updateStatsConfig,"
+ "updateMetadata}";
public static final String FEATURE_GROUP_COMMIT_PATH = FEATURE_GROUP_ID_PATH
+ "/commits{?sort_by,offset,limit}";
Expand Down
Loading

0 comments on commit 4c48a3b

Please sign in to comment.