Skip to content

Commit

Permalink
fix(startf/ds.set_body): infer structure when ds.set_body is called
Browse files Browse the repository at this point in the history
lots of tiny problems were arising from starlark breaking the assertion that
any dataset with a body must also have a structure that dictates how to read
the body. Fixing this at the source
  • Loading branch information
b5 committed Mar 13, 2021
1 parent 30e9aa9 commit a8a3492
Show file tree
Hide file tree
Showing 9 changed files with 20 additions and 150 deletions.
65 changes: 3 additions & 62 deletions base/dataset_prepare.go
Original file line number Diff line number Diff line change
@@ -1,18 +1,15 @@
package base

import (
"bytes"
"context"
"errors"
"fmt"
"io"
"path/filepath"
"strings"

"github.com/qri-io/dataset"
"github.com/qri-io/dataset/detect"
"github.com/qri-io/dataset/validate"
"github.com/qri-io/qfs"
"github.com/qri-io/qri/dsref"
qerr "github.com/qri-io/qri/errors"
"github.com/qri-io/qri/logbook"
Expand Down Expand Up @@ -140,14 +137,10 @@ func InferValues(pro *profile.Profile, ds *dataset.Dataset) error {
// NOTE: add author ProfileID here to keep the dataset package agnostic to
// all identity stuff except keypair crypto
ds.Commit.Author = &dataset.User{ID: pro.ID.String()}
// TODO - infer title & message

// if we don't have a structure or schema then attempt to determine one
body := ds.BodyFile()
if body != nil && (ds.Structure == nil || ds.Structure.Schema == nil) {
if err := InferStructure(ds); err != nil {
return err
}
// add any missing structure fields
if err := detect.Structure(ds); err != nil && !errors.Is(err, dataset.ErrNoBody) {
return err
}

if ds.Transform != nil && ds.Transform.ScriptFile() == nil && ds.Transform.IsEmpty() {
Expand All @@ -157,57 +150,6 @@ func InferValues(pro *profile.Profile, ds *dataset.Dataset) error {
return nil
}

// InferStructure infers the Structure field of the dataset, guaranteeing
// that the structure will contain a Format, FormatConfig, and Schema, based
// on the given dataset body. It will not write over any Structure, Format,
// FormatConfig, or Schema that already exists.
func InferStructure(ds *dataset.Dataset) error {
if ds == nil {
return fmt.Errorf("empty dataset")
}

body := ds.BodyFile()
if body == nil {
return fmt.Errorf("empty body")
}
// use a TeeReader that writes to a buffer to preserve data
buf := &bytes.Buffer{}
tr := io.TeeReader(body, buf)
var df dataset.DataFormat

df, err := detect.ExtensionDataFormat(body.FileName())
if err != nil {
log.Debug(err.Error())
return fmt.Errorf("invalid data format: %s", err.Error())
}

guessedStructure, _, err := detect.FromReader(df, tr)
if err != nil {
log.Debug(err.Error())
return fmt.Errorf("determining dataset structure: %s", err.Error())
}

// attach the structure, schema, and formatConfig, as appropriate
if ds.Structure == nil {
ds.Structure = guessedStructure
}
if ds.Structure.Schema == nil {
ds.Structure.Schema = guessedStructure.Schema
}
if ds.Structure.FormatConfig == nil {
ds.Structure.FormatConfig = guessedStructure.FormatConfig
}
if ds.Structure.Format == "" {
ds.Structure.Format = guessedStructure.Format
}

// glue whatever we just read back onto the reader
// TODO (b5)- this may ruin readers that transparently depend on a read-closer
// we should consider a method on qfs.File that allows this non-destructive read pattern
ds.SetBodyFile(qfs.NewMemfileReader(body.FileName(), io.MultiReader(buf, body)))
return nil
}

// ValidateDataset checks that a dataset is semantically valid
func ValidateDataset(ds *dataset.Dataset) (err error) {
// Ensure that dataset structure is valid
Expand All @@ -216,6 +158,5 @@ func ValidateDataset(ds *dataset.Dataset) (err error) {
err = fmt.Errorf("invalid dataset: %s", err.Error())
return
}

return nil
}
60 changes: 3 additions & 57 deletions base/dataset_prepare_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,60 +104,6 @@ func TestInferValues(t *testing.T) {
}
}

func TestInferStructure(t *testing.T) {
ds := &dataset.Dataset{
Name: "animals",
}
ds.SetBodyFile(qfs.NewMemfileBytes("animals.csv",
[]byte("Animal,Sound,Weight\ncat,meow,1.4\ndog,bark,3.7\n")))

if err := InferStructure(ds); err != nil {
t.Error(err)
}

if ds.Structure.Format != "csv" {
t.Errorf("expected format CSV, got %s", ds.Structure.Format)
}
if ds.Structure.FormatConfig["headerRow"] != true {
t.Errorf("expected format config to set headerRow set to true")
}

actual := datasetSchemaToJSON(ds)
expect := `{"items":{"items":[{"title":"animal","type":"string"},{"title":"sound","type":"string"},{"title":"weight","type":"number"}],"type":"array"},"type":"array"}`

if expect != actual {
t.Errorf("mismatched schema, expected \"%s\", got \"%s\"", expect, actual)
}
}

func TestInferStructureSchema(t *testing.T) {
ds := &dataset.Dataset{
Name: "animals",
Structure: &dataset.Structure{
Format: "csv",
},
}
ds.SetBodyFile(qfs.NewMemfileBytes("animals.csv",
[]byte("Animal,Sound,Weight\ncat,meow,1.4\ndog,bark,3.7\n")))
if err := InferStructure(ds); err != nil {
t.Error(err)
}

if ds.Structure.Format != "csv" {
t.Errorf("expected format CSV, got %s", ds.Structure.Format)
}
if ds.Structure.FormatConfig["headerRow"] != true {
t.Errorf("expected format config to set headerRow set to true")
}

actual := datasetSchemaToJSON(ds)
expect := `{"items":{"items":[{"title":"animal","type":"string"},{"title":"sound","type":"string"},{"title":"weight","type":"number"}],"type":"array"},"type":"array"}`

if expect != actual {
t.Errorf("mismatched schema, expected \"%s\", got \"%s\"", expect, actual)
}
}

func TestInferValuesDontOverwriteSchema(t *testing.T) {
r := newTestRepo(t)
pro := r.Profiles().Owner()
Expand Down Expand Up @@ -188,15 +134,15 @@ func TestInferValuesDontOverwriteSchema(t *testing.T) {
if ds.Structure.Format != "csv" {
t.Errorf("expected format CSV, got %s", ds.Structure.Format)
}
if ds.Structure.FormatConfig != nil {
t.Errorf("expected format config to be nil")
if ds.Structure.FormatConfig == nil {
t.Errorf("expected format config to be non-nil")
}

actual := datasetSchemaToJSON(ds)
expect := `{"items":{"items":[{"title":"animal","type":"number"},{"title":"noise","type":"number"},{"title":"height","type":"number"}],"type":"array"},"type":"array"}`

if expect != actual {
t.Errorf("mismatched schema, expected \"%s\", got \"%s\"", expect, actual)
t.Errorf("mismatched schema, expected %q, got %q", expect, actual)
}
}

Expand Down
3 changes: 2 additions & 1 deletion changes/changes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/google/go-cmp/cmp"
"github.com/qri-io/dataset"
"github.com/qri-io/dataset/detect"
"github.com/qri-io/dataset/tabular"
"github.com/qri-io/qfs"
"github.com/qri-io/qri/base"
Expand Down Expand Up @@ -743,7 +744,7 @@ func (run *testRunner) updateDataset(t *testing.T, ds *dataset.Dataset, newBody

// force recalculate structure as that is what we rely on for the change reports
ds.Structure = nil
if err := base.InferStructure(ds); err != nil {
if err := detect.Structure(ds); err != nil {
t.Fatal(err.Error())
}

Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ require (
github.com/olekukonko/tablewriter v0.0.4
github.com/pkg/errors v0.9.1
github.com/qri-io/dag v0.2.2-0.20201208212257-ae00241c4b48
github.com/qri-io/dataset v0.2.1-0.20210304141850-a4a809d46350
github.com/qri-io/dataset v0.2.1-0.20210312210644-ba8eaa336c8d
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b
github.com/qri-io/didmod v0.0.0-20201123165422-8b2e224c993a
github.com/qri-io/doggos v0.1.0
Expand Down
6 changes: 2 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -1131,10 +1131,8 @@ github.com/qri-io/compare v0.1.0 h1:A/MRx3uEnJ/iMjfJY1VOqH9CYs9zFSEYaFVeXuGfmis=
github.com/qri-io/compare v0.1.0/go.mod h1:i/tVuDGRXVxhuZ8ZUieF23u6rQ6wLGJl7KKWpoMRaTE=
github.com/qri-io/dag v0.2.2-0.20201208212257-ae00241c4b48 h1:6fTW2iHGbaEKQt9u8+04kB3m33KSGLqxF/2pWNleeEg=
github.com/qri-io/dag v0.2.2-0.20201208212257-ae00241c4b48/go.mod h1:1AwOy3yhcZTAXzaF4wGSdnrp87u3PBOrsWXUjOtQCXo=
github.com/qri-io/dataset v0.2.1-0.20210128201320-3b1209495e96 h1:SiP48nzhKLJbvM6SA+5wK53PKUs0FY0DWDylMPyi8S4=
github.com/qri-io/dataset v0.2.1-0.20210128201320-3b1209495e96/go.mod h1:vlq9+Nu37koO3mrp25QGNOt68CLe2d2rAtB9cnDLV6E=
github.com/qri-io/dataset v0.2.1-0.20210304141850-a4a809d46350 h1:uXvx2/y+eqV5o77HLmw51S1aiskOvlq+b6WNTtXHAGk=
github.com/qri-io/dataset v0.2.1-0.20210304141850-a4a809d46350/go.mod h1:vlq9+Nu37koO3mrp25QGNOt68CLe2d2rAtB9cnDLV6E=
github.com/qri-io/dataset v0.2.1-0.20210312210644-ba8eaa336c8d h1:5KfPirdkABg/R3/8S9xwNZUDuwBj32BHrw/BgQ9DiXw=
github.com/qri-io/dataset v0.2.1-0.20210312210644-ba8eaa336c8d/go.mod h1:vlq9+Nu37koO3mrp25QGNOt68CLe2d2rAtB9cnDLV6E=
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b h1:T8qEIv+qLi5mVWvSS329wJ+HbN7cfMwCWjRVzh/+upo=
github.com/qri-io/deepdiff v0.2.1-0.20200807143746-d02d9f531f5b/go.mod h1:NrL/b7YvexgpGb4HEO3Rlx5RrMLDfxuKDf/XDAq5ac0=
github.com/qri-io/didmod v0.0.0-20201123165422-8b2e224c993a h1:40BIa59lae2xZ7iieb3UU4/X57jZsWZ6QgqwdjDQhig=
Expand Down
8 changes: 3 additions & 5 deletions lib/datasets.go
Original file line number Diff line number Diff line change
Expand Up @@ -1437,11 +1437,9 @@ func (m *DatasetMethods) Validate(ctx context.Context, p *ValidateParams) (*Vali
// Schema is set to the provided filename if given, otherwise the dataset's schema
if schemaFlagType == "" {
st = ds.Structure
if ds.Structure == nil || ds.Structure.Schema == nil {
if err := base.InferStructure(ds); err != nil {
log.Debug("lib.Validate: InferStructure error: %w", err)
return nil, err
}
if err := detect.Structure(ds); err != nil {
log.Debug("lib.Validate: InferStructure error: %w", err)
return nil, err
}
} else {
data, err := ioutil.ReadFile(schemaFilename)
Expand Down
8 changes: 0 additions & 8 deletions lib/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (

"github.com/qri-io/dataset"
"github.com/qri-io/dataset/preview"
"github.com/qri-io/qri/base"
"github.com/qri-io/qri/dsref"
"github.com/qri-io/qri/event"
"github.com/qri-io/qri/transform"
Expand Down Expand Up @@ -116,13 +115,6 @@ func (m *TransformMethods) Apply(ctx context.Context, p *ApplyParams) (*ApplyRes
}

if p.Wait {
if ds.Structure == nil {
if err := base.InferStructure(ds); err != nil {
log.Debugw("inferring structure", "err", err)
return nil, err
}
}

ds, err := preview.Create(ctx, ds)
if err != nil {
return nil, err
Expand Down
7 changes: 6 additions & 1 deletion transform/startf/ds/dataset.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sync"

"github.com/qri-io/dataset"
"github.com/qri-io/dataset/detect"
"github.com/qri-io/dataset/dsio"
"github.com/qri-io/qfs"
"github.com/qri-io/starlib/util"
Expand Down Expand Up @@ -310,6 +311,11 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
d.write.SetBodyFile(qfs.NewMemfileBytes(fmt.Sprintf("body.%s", df), []byte(string(str))))
d.modBody = true
d.bodyCache = nil

if err := detect.Structure(d.write); err != nil {
return nil, err
}

return starlark.None, nil
}

Expand All @@ -324,7 +330,6 @@ func (d *Dataset) SetBody(thread *starlark.Thread, _ *starlark.Builtin, args sta
if err != nil {
return starlark.None, err
}

r := NewEntryReader(d.write.Structure, iter)
if err := dsio.Copy(r, w); err != nil {
return starlark.None, err
Expand Down
11 changes: 0 additions & 11 deletions transform/startf/exec_step.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import (
"github.com/qri-io/dataset"
"github.com/qri-io/dataset/preview"
"github.com/qri-io/qfs"
"github.com/qri-io/qri/base"
"github.com/qri-io/qri/dsref"
"github.com/qri-io/qri/event"
skyctx "github.com/qri-io/qri/transform/startf/context"
Expand Down Expand Up @@ -159,16 +158,6 @@ func (r *StepRunner) callTransformFunc(ctx context.Context, thread *starlark.Thr
return err
}

// TODO (b5) - this should happen in ds.set_body method call
if f := ds.BodyFile(); f != nil {
if ds.Structure == nil {
if err := base.InferStructure(ds); err != nil {
log.Debugw("inferring structure", "err", err)
return err
}
}
}

if r.eventsCh != nil {
pview, err := preview.Create(ctx, ds)
if err != nil {
Expand Down

0 comments on commit a8a3492

Please sign in to comment.