diff --git a/.gitignore b/.gitignore
index 7a4cbdab0..9a2b33d60 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 cmd/boxo-migrate/boxo-migrate
+vendor
diff --git a/cmd/feather/main.go b/cmd/feather/main.go
new file mode 100644
index 000000000..321a1b0e8
--- /dev/null
+++ b/cmd/feather/main.go
@@ -0,0 +1,57 @@
+package main
+
+import (
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/ipfs/boxo/unixfs/feather"
+	"github.com/ipfs/go-cid"
+)
+
+func main() {
+	err := mainRet()
+	if err != nil {
+		os.Stderr.WriteString(err.Error())
+		os.Stderr.WriteString("\n")
+		os.Exit(1)
+	}
+	os.Exit(0)
+}
+
+func parseArgs() (cid.Cid, error) {
+	if len(os.Args) != 2 {
+		return cid.Cid{}, fmt.Errorf("expected one argument")
+	}
+
+	return cid.Decode(os.Args[1])
+}
+
+func mainRet() error {
+	c, err := parseArgs()
+	if err != nil {
+		return fmt.Errorf(`%w
+Usage:
+%s
+
+Example:
+%s bafybeigdyrzt5sfp7udm7hu76uh7y26nf3efuylqabf3oclgtqy55fbzdi`, err, os.Args[0], os.Args[0])
+	}
+
+	f, err := feather.NewClient(feather.WithStaticGateway("http://localhost:8080/"))
+	if err != nil {
+		return fmt.Errorf("creating feather client: %w", err)
+	}
+
+	r, err := f.DownloadFile(c)
+	if err != nil {
+		return fmt.Errorf("starting file download: %w", err)
+	}
+	defer r.Close()
+
+	_, err = io.Copy(os.Stdout, r)
+	if err != nil {
+		return fmt.Errorf("downloading file: %w", err)
+	}
+	return nil
+}
diff --git a/examples/go.mod b/examples/go.mod
index 495c2455a..a7ca0e643 100644
--- a/examples/go.mod
+++ b/examples/go.mod
@@ -5,7 +5,7 @@ go 1.20
 require (
 	github.com/ipfs/boxo v0.8.0
 	github.com/ipfs/go-block-format v0.1.2
-	github.com/ipfs/go-cid v0.4.1
+	github.com/ipfs/go-cid v0.4.2-0.20230612091241-80d1e915f662
 	github.com/ipfs/go-datastore v0.6.0
 	github.com/ipld/go-car/v2 v2.10.2-0.20230622090957-499d0c909d33
 	github.com/ipld/go-ipld-prime v0.21.0
diff --git a/examples/go.sum b/examples/go.sum
index eaec6f954..1dcfd3902 100644
--- a/examples/go.sum
+++ b/examples/go.sum
@@ -250,8 +250,8 @@ github.com/ipfs/go-blockservice v0.5.0 h1:B2mwhhhVQl2ntW2EIpaWPwSCxSuqr5fFA93Ms4
 github.com/ipfs/go-cid v0.0.1/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM=
 github.com/ipfs/go-cid v0.0.3/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM=
 github.com/ipfs/go-cid v0.0.6/go.mod h1:6Ux9z5e+HpkQdckYoX1PG/6xqKspzlEIR5SDmgqgC/I=
-github.com/ipfs/go-cid v0.4.1 h1:A/T3qGvxi4kpKWWcPC/PgbvDA2bjVLO7n4UeVwnbs/s=
-github.com/ipfs/go-cid v0.4.1/go.mod h1:uQHwDeX4c6CtyrFwdqyhpNcxVewur1M7l7fNU7LKwZk=
+github.com/ipfs/go-cid v0.4.2-0.20230612091241-80d1e915f662 h1:jWQ5yOEmR1Fvv6Rj8mEye4QTeGQoKKECMieju9z5kgA=
+github.com/ipfs/go-cid v0.4.2-0.20230612091241-80d1e915f662/go.mod h1:4rtyA9XdBeZBapaRNJuTY9H+/6bG4URx/cVwjAzK6fw=
 github.com/ipfs/go-datastore v0.6.0 h1:JKyz+Gvz1QEZw0LsX1IBn+JFCJQH4SJVFtM4uWU0Myk=
 github.com/ipfs/go-datastore v0.6.0/go.mod h1:rt5M3nNbSO/8q1t4LNkLyUwRs8HupMeN/8O4Vn9YAT8=
 github.com/ipfs/go-detect-race v0.0.1 h1:qX/xay2W3E4Q1U7d9lNs1sU9nvguX0a7319XbyQ6cOk=
diff --git a/go.mod b/go.mod
index 2f4d53569..3c70505e3 100644
--- a/go.mod
+++ b/go.mod
@@ -17,7 +17,7 @@ require (
 	github.com/ipfs/bbloom v0.0.4
 	github.com/ipfs/go-bitfield v1.1.0
 	github.com/ipfs/go-block-format v0.1.2
-	github.com/ipfs/go-cid v0.4.1
+	github.com/ipfs/go-cid v0.4.2-0.20230612091241-80d1e915f662
 	github.com/ipfs/go-cidutil v0.1.0
 	github.com/ipfs/go-datastore v0.6.0
 	github.com/ipfs/go-detect-race v0.0.1
@@ -71,6 +71,7 @@ require (
 	go.opentelemetry.io/otel/trace v1.14.0
 	go.uber.org/multierr v1.11.0
 	go.uber.org/zap v1.25.0
+	golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63
 	golang.org/x/oauth2 v0.8.0
 	golang.org/x/sync v0.3.0
 	golang.org/x/sys v0.11.0
@@ -157,7 +158,6 @@ require (
 	go.uber.org/dig v1.17.0 // indirect
 	go.uber.org/fx v1.20.0 // indirect
 	golang.org/x/crypto v0.12.0 // indirect
-	golang.org/x/exp v0.0.0-20230817173708-d852ddb80c63 // indirect
 	golang.org/x/mod v0.12.0 // indirect
 	golang.org/x/net v0.14.0 // indirect
 	golang.org/x/text v0.12.0 // indirect
diff --git a/go.sum b/go.sum
index 8beabe300..af504f6c9 100644
--- a/go.sum
+++ b/go.sum
@@ -252,8 +252,8 @@ github.com/ipfs/go-blockservice v0.5.0 h1:B2mwhhhVQl2ntW2EIpaWPwSCxSuqr5fFA93Ms4
 github.com/ipfs/go-cid v0.0.1/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM=
 github.com/ipfs/go-cid v0.0.3/go.mod h1:GHWU/WuQdMPmIosc4Yn1bcCT7dSeX4lBafM7iqUPQvM=
 github.com/ipfs/go-cid v0.0.6/go.mod h1:6Ux9z5e+HpkQdckYoX1PG/6xqKspzlEIR5SDmgqgC/I=
-github.com/ipfs/go-cid v0.4.1 h1:A/T3qGvxi4kpKWWcPC/PgbvDA2bjVLO7n4UeVwnbs/s=
-github.com/ipfs/go-cid v0.4.1/go.mod h1:uQHwDeX4c6CtyrFwdqyhpNcxVewur1M7l7fNU7LKwZk=
+github.com/ipfs/go-cid v0.4.2-0.20230612091241-80d1e915f662 h1:jWQ5yOEmR1Fvv6Rj8mEye4QTeGQoKKECMieju9z5kgA=
+github.com/ipfs/go-cid v0.4.2-0.20230612091241-80d1e915f662/go.mod h1:4rtyA9XdBeZBapaRNJuTY9H+/6bG4URx/cVwjAzK6fw=
 github.com/ipfs/go-cidutil v0.1.0 h1:RW5hO7Vcf16dplUU60Hs0AKDkQAVPVplr7lk97CFL+Q=
 github.com/ipfs/go-cidutil v0.1.0/go.mod h1:e7OEVBMIv9JaOxt9zaGEmAoSlXW9jdFZ5lP/0PwcfpA=
 github.com/ipfs/go-datastore v0.6.0 h1:JKyz+Gvz1QEZw0LsX1IBn+JFCJQH4SJVFtM4uWU0Myk=
 github.com/ipfs/go-datastore v0.6.0/go.mod h1:rt5M3nNbSO/8q1t4LNkLyUwRs8HupMeN/8O4Vn9YAT8=
diff --git a/unixfs/feather/entry.go b/unixfs/feather/entry.go
new file mode 100644
index 000000000..f87a44f39
--- /dev/null
+++ b/unixfs/feather/entry.go
@@ -0,0 +1,371 @@
+package feather
+
+import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"errors"
+	"fmt"
+	"io"
+	"math"
+	"net/http"
+
+	"github.com/ipfs/boxo/unixfs"
+	"github.com/ipfs/boxo/verifcid"
+	blocks "github.com/ipfs/go-block-format"
+	"github.com/ipfs/go-cid"
+	mh "github.com/multiformats/go-multihash"
+	"golang.org/x/exp/slices"
+)
+
+func cidStringTruncate(c cid.Cid) string {
+	cidStr := c.String()
+	if len(cidStr) > maxCidCharDisplay {
+		// please don't use non-ASCII bases
+		cidStr = cidStr[:maxCidCharDisplay] + "..."
+	}
+	return cidStr
+}
+
+const maxHeaderSize = 32 * 1024 * 1024 // 32MiB
+const maxBlockSize = 2 * 1024 * 1024   // 2MiB
+const maxCidSize = 4096
+const maxElementSize = maxCidSize + maxBlockSize + binary.MaxVarintLen64
+const maxCidCharDisplay = 512
+
+type region struct {
+	c          cid.Cid
+	size       uint64
+	rangeKnown bool
+}
+
+type downloader struct {
+	buf               bufio.Reader
+	state             []region
+	curBlock          []byte
+	readErr           error
+	client            *Client
+	remainingAttempts uint
+	stream            io.Closer
+	hasRetries        bool
+	gotOneBlock       bool
+}
+
+type Client struct {
+	httpClient *http.Client
+	hostname   string
+	retries    uint
+}
+
+type Option func(*Client) error
+
+// WithHTTPClient allows you to use an [http.Client] of your choice.
+func WithHTTPClient(client *http.Client) Option {
+	return func(c *Client) error {
+		c.httpClient = client
+		return nil
+	}
+}
+
+// WithRetries allows you to specify how many times we should retry.
+// [math.MaxUint] indicates infinite retries.
+func WithRetries(n uint) Option {
+	return func(c *Client) error {
+		c.retries = n
+		return nil
+	}
+}
+
+// WithStaticGateway sets a static gateway which will be used for all requests.
+func WithStaticGateway(gateway string) Option {
+	if len(gateway) != 0 && gateway[len(gateway)-1] == '/' {
+		gateway = gateway[:len(gateway)-1]
+	}
+	gateway += "/ipfs/"
+
+	return func(c *Client) error {
+		c.hostname = gateway
+		return nil
+	}
+}
+
+var ErrNoAvailableDataSource = errors.New("no data source")
+
+func NewClient(opts ...Option) (*Client, error) {
+	c := &Client{
+		httpClient: http.DefaultClient,
+	}
+
+	for _, opt := range opts {
+		if err := opt(c); err != nil {
+			return nil, err
+		}
+	}
+
+	if c.hostname == "" {
+		return nil, ErrNoAvailableDataSource
+	}
+
+	return c, nil
+}
+
+// DownloadFile takes in a [cid.Cid] and returns an [io.ReadCloser] which streams the deserialized file.
+// You MUST always call the Close method when you are done using it, otherwise it will leak resources.
+func (client *Client) DownloadFile(c cid.Cid) (io.ReadCloser, error) {
+	attempts := client.retries
+	if attempts != math.MaxUint {
+		attempts++
+	}
+	d := &downloader{
+		client:            client,
+		state:             []region{{c: normalizeCidv0(c)}},
+		buf:               *bufio.NewReaderSize(nil, maxElementSize*2),
+		remainingAttempts: attempts,
+		hasRetries:        client.retries != 0,
+	}
+
+	return d, nil
+}
+
+func (d *downloader) startStream(todo region) error {
+	d.gotOneBlock = false
+	req, err := http.NewRequest("GET", d.client.hostname+todo.c.String()+"?dag-scope=entity", bytes.NewReader(nil))
+	if err != nil {
+		return err
+	}
+	req.Header.Add("Accept", "application/vnd.ipld.car;dups=y;order=dfs;version=1")
+
+	resp, err := d.client.httpClient.Do(req)
+	if err != nil {
+		return err
+	}
+	var good bool
+	defer func() {
+		if !good {
+			d.Close()
+		}
+	}()
+
+	d.stream = resp.Body
+	d.buf.Reset(resp.Body)
+
+	headerSize, err := binary.ReadUvarint(&d.buf)
+	if err != nil {
+		return err
+	}
+	if headerSize > maxHeaderSize {
+		return fmt.Errorf("header is too big at %d, max allowed %d", headerSize, maxHeaderSize)
+	}
+
+	_, err = d.buf.Discard(int(headerSize))
+	if err != nil {
+		return err
+	}
+
+	good = true
+
+	return nil
+}
+
+func loadCidFromBytes(cidBytes []byte) (cid.Cid, error) {
+	if len(cidBytes) == 0 {
+		return cid.Cid{}, fmt.Errorf("missing CID")
+	}
+	if len(cidBytes) > maxCidSize {
+		return cid.Cid{}, fmt.Errorf("CID is too big, %d max allowed %d", len(cidBytes), maxCidSize)
+	}
+
+	c, err := cid.Cast(cidBytes)
+	if err != nil {
+		return cid.Cid{}, fmt.Errorf("malformed CID: %w", err)
+	}
+
+	return c, nil
+}
+
+func (d *downloader) Read(b []byte) (_ int, err error) {
+	if d.readErr != nil {
+		return 0, d.readErr
+	}
+	defer func() {
+		d.readErr = err
+	}()
+	for len(d.curBlock) == 0 {
+		// have to fill more data in the buffer
+		if len(d.state) == 0 {
+			// no more data remaining
+			return 0, io.EOF
+		}
+
+		// pop the current item from the DFS stack
+		last := len(d.state) - 1
+		todo := d.state[last]
+		d.state = d.state[:last]
+
+		var data []byte
+		c := todo.c
+
+		pref := c.Prefix()
+		switch pref.MhType {
+		case mh.IDENTITY:
+			data = c.Hash()
+			data = data[len(data)-pref.MhLength:] // extract digest
+		default:
+			data, err = d.next(todo)
+			if err != nil {
+				return 0, err
+			}
+		}
+
+		b, err := blocks.NewBlockWithCid(data, c)
+		if err != nil {
+			return 0, err
+		}
+		node, err := unixfs.Parse(b)
+		if err != nil {
+			return 0, err
+		}
+
+		switch n := node.(type) {
+		case unixfs.File[string, string]:
+			d.curBlock = n.Data
+
+			filesize := uint64(len(n.Data))
+			if childs := n.Childrens; len(childs) != 0 {
+				regions := slices.Grow(d.state, len(childs))
+				for i := len(childs); i > 0; {
+					i--
+					regions = append(regions, region{
+						c:          normalizeCidv0(childs[i].Cid),
+						size:       childs[i].FileSize,
+						rangeKnown: true,
+					})
+					filesize += childs[i].FileSize
+				}
+				d.state = regions
+			}
+
+			if todo.rangeKnown && todo.size != filesize {
+				return 0, fmt.Errorf("inconsistent filesize for %s, expected %d; got %d", cidStringTruncate(c), todo.size, filesize)
+			}
+		default:
+			return 0, fmt.Errorf("unknown unixfs type, got %T for %s", node, cidStringTruncate(c))
+		}
+	}
+
+	n := copy(b, d.curBlock)
+	d.curBlock = d.curBlock[n:]
+
+	return n, nil
+}
+
+// next downloads the next block; it also handles performing retries if needed.
+// The data returned is hash-verified.
+func (d *downloader) next(todo region) ([]byte, error) {
+	c := todo.c
+	if err := verifcid.ValidateCid(verifcid.DefaultAllowlist, c); err != nil {
+		return nil, fmt.Errorf("cid %s didn't pass the safety check: %w", cidStringTruncate(c), err)
+	}
+	var errStartStream, errRead error
+	for {
+		if d.stream == nil {
+			if !d.hasRetries && errRead == io.EOF {
+				return nil, fmt.Errorf("gateway terminated too early, still want: %s", cidStringTruncate(c))
+			}
+			if attempts := d.remainingAttempts; attempts != math.MaxUint {
+				if attempts == 0 {
+					return nil, fmt.Errorf("could not download next block: %w", errors.Join(errRead, errStartStream))
+				}
+				d.remainingAttempts = attempts - 1
+			}
+			errStartStream = d.startStream(todo)
+			if errStartStream != nil {
+				// couldn't open a new stream; loop and charge another attempt
+				// against the retry budget instead of reading a nil stream.
+				continue
+			}
+		}
+		var data []byte
+		data, errRead = d.readBlockFromStream(c)
+		if errRead == nil {
+			return data, nil
+		}
+		d.stream.Close()
+		d.stream = nil
+	}
+}
+
+// readBlockFromStream must perform hash verification on the input.
+// The returned slice is only valid until the next readBlockFromStream or Close call.
+// Implementations should reuse buffers to avoid allocations.
+func (d *downloader) readBlockFromStream(expectedCid cid.Cid) (_ []byte, rErr error) {
+	itemLenU, err := binary.ReadUvarint(&d.buf)
+	switch err {
+	case io.EOF:
+		return nil, err
+	case nil:
+		break
+	default:
+		return nil, fmt.Errorf("reading next block length: %w", err)
+	}
+	if itemLenU > maxBlockSize+maxCidSize {
+		return nil, fmt.Errorf("item size (%d) for %s exceeds maxBlockSize+maxCidSize (%d)", itemLenU, cidStringTruncate(expectedCid), maxBlockSize+maxCidSize)
+	}
+	itemLen := int(itemLenU)
+
+	cidLen, cidFound, err := cid.CidFromReader(&d.buf)
+	if err != nil {
+		err = eofWouldBeUnexpected(err)
+		return nil, fmt.Errorf("trying to read %s failed to read cid: %w", cidStringTruncate(expectedCid), err)
+	}
+	if cidLen > maxCidSize {
+		return nil, fmt.Errorf("cidFound for %s is too big at %d bytes", cidStringTruncate(expectedCid), cidLen)
+	}
+	cidFound = normalizeCidv0(cidFound)
+	if cidFound != expectedCid {
+		return nil, fmt.Errorf("downloading %s but got %s instead", cidStringTruncate(expectedCid), cidStringTruncate(cidFound))
+	}
+
+	blockSize := itemLen - cidLen
+	if blockSize > maxBlockSize {
+		return nil, fmt.Errorf("block %s is too big (%d) max %d", cidStringTruncate(expectedCid), blockSize, maxBlockSize)
+	}
+	data, err := d.buf.Peek(blockSize)
+	if err != nil {
+		err = eofWouldBeUnexpected(err)
+		return nil, fmt.Errorf("peeking at block data for %s verification: %w", cidStringTruncate(expectedCid), err)
+	}
+	_, err = d.buf.Discard(len(data))
+	if err != nil {
+		return nil, fmt.Errorf("critical: Discard is supposed to always succeed as long as we don't read less than buffered: %w", err)
+	}
+
+	cidGot, err := expectedCid.Prefix().Sum(data)
+	if err != nil {
+		return nil, fmt.Errorf("hashing data for %s: %w", cidStringTruncate(expectedCid), err)
+	}
+	cidGot = normalizeCidv0(cidGot)
+
+	if cidGot != expectedCid {
+		return nil, fmt.Errorf("data integrity failed, expected %s; got %s", cidStringTruncate(expectedCid), cidStringTruncate(cidGot))
+	}
+
+	return data, nil
+}
+
+func eofWouldBeUnexpected(err error) error {
+	if err == io.EOF {
+		return io.ErrUnexpectedEOF
+	}
+	return err
+}
+
+func (d *downloader) Close() error {
+	if s := d.stream; s != nil {
+		d.stream = nil
+		return s.Close()
+	}
+	return nil
+}
+
+func normalizeCidv0(c cid.Cid) cid.Cid {
+	if c.Version() == 0 {
+		return cid.NewCidV1(cid.DagProtobuf, c.Hash())
+	}
+	return c
+}
diff --git a/unixfs/feather/feather_test.go b/unixfs/feather/feather_test.go
new file mode 100644
index 000000000..052420cf2
--- /dev/null
+++ b/unixfs/feather/feather_test.go
@@ -0,0 +1,82 @@
+package feather_test
+
+import (
+	"bytes"
+	"crypto/sha256"
+	"encoding/hex"
+	"io"
+	"net/http/httptest"
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/ipfs/boxo/blockservice"
+	"github.com/ipfs/boxo/exchange/offline"
+	"github.com/ipfs/boxo/gateway"
+	"github.com/ipfs/boxo/unixfs/feather"
+	"github.com/ipfs/go-cid"
+	carblockstore "github.com/ipld/go-car/v2/blockstore"
+	"github.com/stretchr/testify/assert"
+)
+
+func newGateway(t *testing.T, fixture string) (*httptest.Server, cid.Cid) {
+	t.Helper()
+
+	r, err := os.Open(filepath.Join("./testdata", fixture))
+	assert.NoError(t, err)
+
+	blockStore, err := carblockstore.NewReadOnly(r, nil)
+	assert.NoError(t, err)
+
+	t.Cleanup(func() {
+		blockStore.Close()
+		r.Close()
+	})
+
+	cids, err := blockStore.Roots()
+	assert.NoError(t, err)
+	assert.Len(t, cids, 1)
+
+	blockService := blockservice.New(blockStore, offline.Exchange(blockStore))
+
+	backend, err := gateway.NewBlocksBackend(blockService)
+	assert.NoError(t, err)
+
+	handler := gateway.NewHandler(gateway.Config{}, backend)
+
+	ts := httptest.NewServer(handler)
+	t.Cleanup(func() { ts.Close() })
+
+	return ts, cids[0]
+}
+
+func newFeather(t *testing.T, fixture string) (*feather.Client, cid.Cid) {
+	t.Helper()
+
+	gw, cid := newGateway(t, fixture)
+	f, err := feather.NewClient(feather.WithHTTPClient(gw.Client()), feather.WithStaticGateway(gw.URL))
+	assert.NoError(t, err)
+	return f, cid
+}
+
+func mustParseHex(s string) []byte {
+	v, err := hex.DecodeString(s)
+	if err != nil {
+		panic(err)
+	}
+	return v
+}
+
+func TestFileWithManyRawLeaves(t *testing.T) {
+	f, root := newFeather(t, "file-with-many-raw-leaves.car")
+	file, err := f.DownloadFile(root)
+	assert.NoError(t, err)
+	defer func() { assert.NoError(t, file.Close()) }()
+	h := sha256.New()
+	_, err = io.Copy(h, file)
+	assert.NoError(t, err)
+
+	if !bytes.Equal(h.Sum(nil), mustParseHex("5e38d403b548e38fe350410347f6310b757203b19be6cd5323ec3ca56404b387")) {
+		t.Error("decoded content does not match expected")
+	}
+}
diff --git a/unixfs/feather/testdata/file-with-many-raw-leaves.car b/unixfs/feather/testdata/file-with-many-raw-leaves.car
new file mode 100644
index 000000000..b879d379b
Binary files /dev/null and b/unixfs/feather/testdata/file-with-many-raw-leaves.car differ
diff --git a/unixfs/json.go b/unixfs/json.go
new file mode 100644
index 000000000..96921fe2b
--- /dev/null
+++ b/unixfs/json.go
@@ -0,0 +1,103 @@
+package unixfs
+
+import (
+	"encoding"
+	"errors"
+	"fmt"
+	"strconv"
+)
+
+var _ fmt.Stringer = AliasableString(nil)
+var _ encoding.TextMarshaler = AliasableString(nil)
+var _ encoding.TextUnmarshaler = (*AliasableString)(nil)
+
+// AliasableString is a byte slice that has string JSON semantics, allowing decoding to skip allocations.
+type AliasableString []byte
+
+func (s AliasableString) String() string {
+	return string(s)
+}
+
+func (s AliasableString) MarshalText() ([]byte, error) {
+	return s, nil
+}
+
+func (s *AliasableString) UnmarshalText(b []byte) error {
+	// Sadly we must copy.
+	// UnmarshalText must copy the text if it wishes to retain the text after returning.
+	new := make([]byte, len(b))
+	copy(new, b)
+	*s = new
+	return nil
+}
+
+var _ fmt.Stringer = Type(0)
+var _ encoding.TextMarshaler = Type(0)
+var _ encoding.TextUnmarshaler = (*Type)(nil)
+
+// Type is an alternative to [Node] which allows for zero-allocation code.
+type Type uint8
+
+func (t Type) String() string {
+	switch t {
+	case TError:
+		return "Error"
+	case TFile:
+		return "File"
+	case TDirectory:
+		return "Directory"
+	case TSymlink:
+		return "Symlink"
+	default:
+		return "error unknown type: " + strconv.FormatUint(uint64(t), 10)
+	}
+}
+
+var (
+	textError     = []byte("Error")
+	textFile      = []byte("File")
+	textDirectory = []byte("Directory")
+	textSymlink   = []byte("Symlink")
+)
+
+func (t Type) MarshalText() ([]byte, error) {
+	switch t {
+	case TError:
+		return textError, nil
+	case TFile:
+		return textFile, nil
+	case TDirectory:
+		return textDirectory, nil
+	case TSymlink:
+		return textSymlink, nil
+	default:
+		return nil, errors.New(t.String())
+	}
+}
+
+func (t *Type) UnmarshalText(b []byte) error {
+	switch string(b) {
+	case "Error":
+		*t = TError
+		return nil
+	case "File":
+		*t = TFile
+		return nil
+	case "Directory":
+		*t = TDirectory
+		return nil
+	case "Symlink":
+		*t = TSymlink
+		return nil
+	default:
+		return fmt.Errorf("unknown unixfs type: %q", string(b))
+	}
+}
+
+const (
+	// TError is returned when something went wrong.
+	TError Type = iota
+	TFile
+	TDirectory
+	TSymlink
+)
diff --git a/unixfs/pb.go b/unixfs/pb.go
new file mode 100644
index 000000000..a90e12955
--- /dev/null
+++ b/unixfs/pb.go
@@ -0,0 +1,415 @@
+// @Jorropo: I wrote a bespoke protobuf decoder here because I couldn't find a
+// zero-allocation protobuf decoder generator.
+// I do not count pooling or arenas as zero allocation btw.
+// If you are reading this text trying to add more fields and this is too painful
+// to deal with, feel free to remove this code and replace it with an allocating
+// codegen decoder. Ping me too; if I'm still around I might revert your changes
+// and bring back the allocation-free decoder, but with the new feature.
+package unixfs
+
+import (
+	"errors"
+	"fmt"
+
+	"github.com/ipfs/go-cid"
+	"golang.org/x/exp/slices"
+	"google.golang.org/protobuf/encoding/protowire"
+)
+
+const (
+	_ = iota
+	pbDirectory
+	pbFile
+	pbMetadata
+	pbSymlink
+	pbHAMTShard
+)
+
+// Reference:
+//
+//	message Data {
+//		enum DataType {
+//			Raw = 0;
+//			Directory = 1;
+//			File = 2;
+//			Metadata = 3;
+//			Symlink = 4;
+//			HAMTShard = 5;
+//		}
+//
+//		required DataType Type = 1;
+//		optional bytes Data = 2;
+//		optional uint64 filesize = 3;
+//		repeated uint64 blocksizes = 4;
+//
+//		optional uint64 hashType = 5;
+//		optional uint64 fanout = 6;
+//	}
+//
+//	message Metadata {
+//		optional string MimeType = 1;
+//	}
+//
+//	message PBLink {
+//		// binary CID (with no multibase prefix) of the target object
+//		optional bytes Hash = 1;
+//
+//		// UTF-8 string name
+//		optional string Name = 2;
+//
+//		// cumulative size of target object
+//		optional uint64 Tsize = 3;
+//	}
+//
+//	message PBNode {
+//		// refs to other objects
+//		repeated PBLink Links = 2;
+//
+//		// Unixfs message inside the user opaque data
+//		optional Data Data = 1;
+//	}
+
+func parsePB[Self, Children cid.Storage](
+	fileChildrens []FileEntry[Children],
+	directoryChildrens []DirectoryEntry[Children],
+	inCid cid.GenericCid[Self], origData []byte,
+) (dataType uint64, _ []FileEntry[Children], fileLinks, blocksizes uint, _ []DirectoryEntry[Children], content []byte, selfTSize uint64, _ error) {
+	selfTSize = 1
+	data := origData
+
+	moveZeroNamedDirectoryEntriesToDirectoryChildrens := func(extra int) {
+		// some zero-named children were confused for file entries before, move them here
+		// FIXME: is an empty name a valid file name in a directory?
+		directoryChildrens = slices.Grow(directoryChildrens, len(fileChildrens)+extra)
+		for _, v := range fileChildrens {
+			directoryChildrens = append(directoryChildrens, DirectoryEntry[Children]{
+				Entry: Entry[Children]{Cid: v.Cid, tSize: v.tSize},
+				Name:  AliasableString{},
+			})
+		}
+
+		fileChildrens = nil
+	}
+
+	for len(data) != 0 { // iterate at the root level of the message
+		outerNumber, t, l := protowire.ConsumeTag(data)
+		if l < 0 {
+			return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+		}
+		data = data[l:]
+		switch outerNumber {
+		case 1, 2:
+			// optional Data Data = 1;
+			// repeated PBLink Links = 2;
+			var group bool
+			var mData []byte
+			switch t {
+			case protowire.StartGroupType:
+				// boundary-delimited message
+				group = true
+				mData = data
+			case protowire.BytesType:
+				// length-prefixed message
+				mData, l = protowire.ConsumeBytes(data)
+				if l < 0 {
+					return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+				}
+				data = data[l:] // we just extracted the message so walk over it completely
+			default:
+				var err error
+				if outerNumber == 1 {
+					err = fmt.Errorf("unknown type for Data field %v", t)
+				} else {
+					err = fmt.Errorf("unknown type for Links field %v", t)
+				}
+				return 0, nil, 0, 0, nil, nil, 0, err
+			}
+
+			var c cid.GenericCid[Children]
+			var name []byte
+			var tSize uint64 // will be offset by +1, zero means not found
+
+			for len(mData) != 0 {
+				n, t, l := protowire.ConsumeTag(mData)
+				if l < 0 {
+					return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+				}
+				mData = mData[l:]
+
+				if t == protowire.EndGroupType {
+					// if we find an EGROUP here it must be ours since pbHandleUnknownField skips over groups.
+					break
+				}
+
+				if outerNumber == 1 {
+					// optional Data Data = 1;
+					switch n {
+					case 1:
+						// required DataType Type = 1;
+						var err error
+						mData, dataType, err = pbDecodeNumber(t, mData)
+						if err != nil {
+							return 0, nil, 0, 0, nil, nil, 0, err
+						}
+						// due to "Last One Wins" semantics we can't do anything meaningful without fully decoding the message first.
+
+					case 2:
+						// optional bytes Data = 2;
+						switch t {
+						case protowire.BytesType:
+							content, l = protowire.ConsumeBytes(mData)
+							if l < 0 {
+								return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+							}
+							mData = mData[l:]
+
+						default:
+							return 0, nil, 0, 0, nil, nil, 0, fmt.Errorf("unknown type for Data.Data field %v", t)
+						}
+
+					case 4:
+						// repeated uint64 blocksizes = 4;
+						addBlocksize := func(blocksize uint64) error {
+							if len(directoryChildrens) != 0 {
+								return errors.New("invalid unixfs node, mixed use of blocksizes and named links")
+							}
+
+							if uint(len(fileChildrens)) > blocksizes {
+								// we have discovered more links than blocksizes at this point, play catchup
+								fileChildrens[blocksizes].FileSize = blocksize
+							} else {
+								// we have discovered more blocksizes than links at this point, add new entries
+								fileChildrens = append(fileChildrens, FileEntry[Children]{FileSize: blocksize})
+							}
+							blocksizes++
+							return nil
+						}
+
+						switch t {
+						// FIXME: this condition accepts Fixed numbers, is that valid?
+						// I mean it works, but do other protobuf parsers do this?
+						case protowire.VarintType, protowire.Fixed64Type, protowire.Fixed32Type:
+							var blocksize uint64
+							var err error
+							mData, blocksize, err = pbDecodeNumber(t, mData)
+							if err != nil {
+								return 0, nil, 0, 0, nil, nil, 0, err
+							}
+							if err := addBlocksize(blocksize); err != nil {
+								return 0, nil, 0, 0, nil, nil, 0, err
+							}
+
+						case protowire.BytesType:
+							// packed representation
+							packed, l := protowire.ConsumeBytes(mData)
+							if l < 0 {
+								return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+							}
+							mData = mData[l:]
+
+							for len(packed) != 0 {
+								blocksize, l := protowire.ConsumeVarint(packed)
+								if l < 0 {
+									return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+								}
+								packed = packed[l:]
+
+								if err := addBlocksize(blocksize); err != nil {
+									return 0, nil, 0, 0, nil, nil, 0, err
+								}
+							}
+
+						default:
+							return 0, nil, 0, 0, nil, nil, 0, fmt.Errorf("unknown type for Data.Blocksizes field %v", t)
+						}
+
+					default:
+						var err error
+						mData, err = pbHandleUnknownField(t, mData)
+						if err != nil {
+							return 0, nil, 0, 0, nil, nil, 0, err
+						}
+					}
+				} else {
+					// repeated PBLink Links = 2;
+					switch n {
+					case 1:
+						// optional bytes Hash = 1;
+						switch t {
+						case protowire.BytesType:
+							cBytes, l := protowire.ConsumeBytes(mData)
+							if l < 0 {
+								return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+							}
+							mData = mData[l:]
+
+							var err error
+							c, err = cid.CastGeneric[Children](cBytes)
+							if err != nil {
+								return 0, nil, 0, 0, nil, nil, 0, fmt.Errorf("failed to decode cid: %w", err)
+							}
+						default:
+							return 0, nil, 0, 0, nil, nil, 0, fmt.Errorf("unknown type for Links.Hash field %v", t)
+						}
+
+					case 2:
+						// optional string Name = 2;
+						switch t {
+						case protowire.BytesType:
+							name, l = protowire.ConsumeBytes(mData)
+							if l < 0 {
+								return 0, nil, 0, 0, nil, nil, 0, protowire.ParseError(l)
+							}
+							mData = mData[l:]
+
+						default:
+							return 0, nil, 0, 0, nil, nil, 0, fmt.Errorf("unknown type for Links.Name field %v", t)
+						}
+
+					case 3:
+						// optional uint64 Tsize = 3;
+						var err error
+						mData, tSize, err = pbDecodeNumber(t, mData)
+						if err != nil {
+							return 0, nil, 0, 0, nil, nil, 0, err
+						}
+						if selfTSize != 0 {
+							if tSize == 0 {
+								selfTSize = 0
+							} else {
+								selfTSize += tSize
+							}
+						}
+						tSize++
+
+					default:
+						var err error
+						mData, err = pbHandleUnknownField(t, mData)
+						if err != nil {
+							return 0, nil, 0, 0, nil, nil, 0, err
+						}
+					}
+				}
+			}
+
+			if outerNumber == 2 {
+				// repeated PBLink Links = 2;
+				if !c.Defined() {
+					return 0, nil, 0, 0, nil, nil, 0, errors.New("link is missing CID")
+				}
+
+				// note we accept present but empty name entries on files because some historic
+				// encoders emitted a whole bunch of them in the wild
+				if len(name) != 0 || len(directoryChildrens) != 0 {
+					// Directory entry
+					if blocksizes != 0 {
+						return 0, nil, 0, 0, nil, nil, 0, errors.New("mixed use of blocksizes and named links")
+					}
+
+					if len(fileChildrens) != 0 {
+						moveZeroNamedDirectoryEntriesToDirectoryChildrens(1)
+					}
+
+					directoryChildrens = append(directoryChildrens, DirectoryEntry[Children]{
+						Entry: Entry[Children]{Cid: c, tSize: tSize},
+						Name:  AliasableString(name),
+					})
+				} else {
+					// File entry
+					if uint(len(fileChildrens)) > fileLinks {
+						// we have discovered more blocksizes than links at this point, play catchup
+						fileChildrens[fileLinks].Cid = c
+						fileChildrens[fileLinks].tSize = tSize
+					} else {
+						// we have discovered more links than blocksizes at this point, add new entries
+						fileChildrens = append(fileChildrens, FileEntry[Children]{Entry: Entry[Children]{Cid: c, tSize: tSize}})
+					}
+					fileLinks++
+				}
+			}
+
+			if group {
+				// Now that we have found the end, restore data.
+				data = mData
+			}
+
+		default:
+			var err error
+			data, err = pbHandleUnknownField(t, data)
+			if err != nil {
+				return 0, nil, 0, 0, nil, nil, 0, err
+			}
+		}
+	}
+
+	return dataType, fileChildrens, fileLinks, blocksizes, directoryChildrens, content, selfTSize, nil
+}
+
+// pbHandleUnknownField must be called right after the tag; it handles
+// skipping unneeded values.
+func pbHandleUnknownField(t protowire.Type, data []byte) ([]byte, error) {
+	if len(data) == 0 {
+		return nil, errors.New("no field to consume")
+	}
+
+	var groupStack uint
+	for {
+		var l int
+		switch t {
+		case protowire.BytesType:
+			_, l = protowire.ConsumeBytes(data)
+		case protowire.VarintType:
+			_, l = protowire.ConsumeVarint(data)
+		case protowire.Fixed64Type:
+			_, l = protowire.ConsumeFixed64(data)
+		case protowire.Fixed32Type:
+			_, l = protowire.ConsumeFixed32(data)
+		case protowire.StartGroupType:
+			groupStack++
+			goto next
+		case protowire.EndGroupType:
+			if groupStack == 0 {
+				return nil, errors.New("unmatched end group")
+			}
+			groupStack--
+			goto next
+		default:
+			return nil, fmt.Errorf("unknown protobuf type: %v", t)
+		}
+		if l < 0 {
+			return nil, protowire.ParseError(l)
+		}
+		data = data[l:]
+
+	next:
+		if groupStack == 0 {
+			break
+		}
+
+		_, t, l = protowire.ConsumeTag(data)
+		if l < 0 {
+			return nil, protowire.ParseError(l)
+		}
+		data = data[l:]
+	}
+	return data, nil
+}
+
+// pbDecodeNumber will decode a uint64 as best as it can.
+// It must be called right after the tag.
+func pbDecodeNumber(typ protowire.Type, data []byte) ([]byte, uint64, error) {
+	var v uint64
+	var l int
+	switch typ {
+	case protowire.VarintType:
+		v, l = protowire.ConsumeVarint(data)
+	case protowire.Fixed64Type:
+		v, l = protowire.ConsumeFixed64(data)
+	case protowire.Fixed32Type:
+		var v32 uint32
+		v32, l = protowire.ConsumeFixed32(data)
+		v = uint64(v32)
+	default:
+		return nil, 0, fmt.Errorf("unexpected type for number %v", typ)
+	}
+	if l < 0 {
+		return nil, 0, protowire.ParseError(l)
+	}
+	return data[l:], v, nil
+}
diff --git a/unixfs/pb_test.go b/unixfs/pb_test.go
new file mode 100644
index 000000000..f1f6e0802
--- /dev/null
+++ b/unixfs/pb_test.go
@@ -0,0 +1,45 @@
+package unixfs
+
+import (
+	"encoding/base64"
+	"testing"
+
+	"github.com/ipfs/go-cid"
+	mh "github.com/multiformats/go-multihash"
+)
+
+const someDagPBBlock = `EisKIhIgVuq+9ViNicx1O8bIsb978a8u1uoTjm4taEeNW7gcB+cSABiu1OAVEioKIhIg7XyJKU3lrLCYFLKmcNTtKc82BUBCi5ePAeAqz2M1pWYSABirmGcKEAgCGPGUxxYggIDgFSDxlGc=`
+
+func BenchmarkPB(b *testing.B) {
+	data, err := base64.StdEncoding.DecodeString(someDagPBBlock)
+	if err != nil {
+		b.Fatal(err)
+	}
+	mh, err := mh.Sum(data, mh.SHA2_256, -1)
+	if err != nil {
+		b.Fatal(err)
+	}
+	c := cid.NewCidV0Generic[[]byte](mh)
+
+	b.ResetTimer()
+	var out []FileEntry[[]byte]
+	for i := b.N; i != 0; i-- {
+		_, out, _, _, _, _, _, _ = parsePB(out[:0], nil, c, data)
+	}
+}
+
+func FuzzPB(f *testing.F) {
+	data, err := base64.StdEncoding.DecodeString(someDagPBBlock)
+	if err != nil {
+		f.Fatal(err)
+	}
+	f.Add(data)
+	f.Fuzz(func(_ *testing.T, b []byte) {
+		if len(b) > 2*1024*1024 {
+			// Assume a block limit is in place.
+			return
+		}
+		var zero cid.GenericCid[[]byte]
+		parsePB[[]byte, []byte](nil, nil, zero, b)
+	})
+}
diff --git a/unixfs/unixfs.go b/unixfs/unixfs.go
new file mode 100644
index 000000000..69da77ee7
--- /dev/null
+++ b/unixfs/unixfs.go
@@ -0,0 +1,190 @@
+// unixfs provides type-safe low-level primitives to read and write unixfs blocks.
+// It handles encoding, decoding and validation but does not handle any
+// cross-block linking; this is provided by various opinionated implementations
+// available in sub-packages, or left as an exercise to the consumer.
+//
+// This package is Data-Oriented; the main way this impacts tradeoffs is that
+// state is moved to control flow when possible and allocations are hammered to
+// a minimum, for example by returning pointers aliased to the input.
+package unixfs
+
+import (
+	"errors"
+	"fmt"
+
+	blocks "github.com/ipfs/go-block-format"
+	"github.com/ipfs/go-cid"
+	"github.com/multiformats/go-multicodec"
+)
+
+// Entry is a basic unit block.
+type Entry[S cid.Storage] struct {
+	Cid cid.GenericCid[S]
+	// tSize encodes the cumulative size of the DAG.
+	// The zero value indicates tsize is missing.
+	tSize uint64
+}
+
+func (e Entry[S]) TSize() (tsize uint64, ok bool) {
+	if e.tSize == 0 {
+		return 0, false
+	}
+
+	return e.tSize - 1, true
+}
+
+func (e Entry[S]) Untyped() Entry[S] {
+	return e
+}
+
+var _ Node = File[string, string]{}
+
+type File[Self, Children cid.Storage] struct {
+	//lint:ignore U1000 this is a badge pattern
+	badge
+	Entry[Self]
+	Data []byte
+
+	Childrens []FileEntry[Children]
+}
+
+func FileEntryWithTSize[S cid.Storage](c cid.GenericCid[S], fileSize, tSize uint64) FileEntry[S] {
+	return FileEntry[S]{Entry: Entry[S]{Cid: c, tSize: tSize + 1}, FileSize: fileSize}
+}
+
+type FileEntry[S cid.Storage] struct {
+	Entry[S]
+	// FileSize is the logical size of the file at this location once decoded.
+	FileSize uint64
+}
+
+var _ Node = Directory[string, string]{}
+
+type Directory[Self, Children cid.Storage] struct {
+	//lint:ignore U1000 this is a badge pattern
+	badge
+	Entry[Self]
+	Childrens []DirectoryEntry[Children]
+}
+
+type DirectoryEntry[S cid.Storage] struct {
+	Entry[S]
+	Name AliasableString
+}
+
+var _ Node = Symlink[string]{}
+
+type Symlink[S cid.Storage] struct {
+	//lint:ignore U1000 this is a badge pattern
+	badge
+	Entry[S]
+	Value []byte
+}
+
+// badge authorizes a type to be a [Node].
+// If you add a new type using this you need to update [Parse].
+type badge struct{}
+
+//lint:ignore U1000 this is a badge pattern
+func (badge) nodeBadge() {
+	panic("badge was called even though it only exists as a way to trick the type checker")
+}
+
+// Node is an interface that can exclusively be a [File], [Directory] or [Symlink]. We might add more in the future.
+// You MUST NOT embed this interface; its only purpose is to provide type-safe enums.
+type Node interface {
+	// Untyped returns the untyped [Entry] for that value stripped of all type related information.
+	Untyped() Entry[string]
+	// nodeBadge must never be called, it's just here to trick the type checker.
+	nodeBadge()
+}
+
+// Parse provides a type-safe solution to decode using the badged interface [Node].
+// [File.Data], [DirectoryEntry.Name] and [Symlink.Value] values are aliased to b.RawData().
+// The data argument MUST hash to cid; this won't check the validity of the hash.
+// It assumes the size of the block is limited and reasonable.
+func Parse(b blocks.Block) (Node, error) {
+	switch t, f, d, s, err := ParseAppend[string, string](nil, nil, b.Cid(), b.RawData()); t {
+	case TError:
+		return nil, err
+	case TFile:
+		return f, nil
+	case TDirectory:
+		return d, nil
+	case TSymlink:
+		return s, nil
+	default:
+		return nil, errors.New("unknown node type in Parse (should never happen, please open an issue!): " + t.String())
+	}
+}
+
+// ParseAppend is like [Parse] except it is turbocharged to avoid allocations.
+// It returns a [Type] which indicates which of the structs is correct; all of this is passed on the stack or in registers.
+// Assuming the capacity in the slices is big enough and err == nil, it does not allocate anything and the arguments do not escape.
+// [File.Data], [DirectoryEntry.Name] and [Symlink.Value] values are aliased to b.RawData().
+// It also accepts input slices which will be appended to and returned in structs to avoid allocations.
+// It will only ever clobber the slice related to the type of data decoded.
+// It only ever clobbers extra capacity within the slices; it may do so in the case of an error.
+// The data argument MUST hash to cid; this won't check the validity of the hash.
+// It assumes the size of the block is limited and reasonable.
+func ParseAppend[Self, Children cid.Storage](
+	fileChildrens []FileEntry[Children],
+	directoryChildrens []DirectoryEntry[Children],
+	inCid cid.GenericCid[Self], data []byte,
+) (t Type, f File[Self, Children], d Directory[Self, Children], s Symlink[Self], err error) {
+	// Avoid clobbering the used part of the slice.
+	fileChildrens = fileChildrens[len(fileChildrens):]
+	directoryChildrens = directoryChildrens[len(directoryChildrens):]
+
+	var dataType, selfTSize uint64
+	var fileLinks, blocksizes uint
+	var content []byte
+
+	pref := inCid.Prefix()
+	switch c := multicodec.Code(pref.Codec); c {
+	case multicodec.Raw:
+		t = TFile
+		f = File[Self, Children]{
+			Entry: Entry[Self]{
+				Cid:   inCid,
+				tSize: uint64(len(data)) + 1,
+			},
+			Data:      data,
+			Childrens: fileChildrens,
+		}
+		return
+	case multicodec.DagPb:
+		dataType, fileChildrens, fileLinks, blocksizes, directoryChildrens, content, selfTSize, err = parsePB(fileChildrens, directoryChildrens, inCid, data)
+	default:
+		err = errors.New("unsupported codec: " + c.String())
+		return
+	}
+	if err != nil {
+		err = fmt.Errorf("failed to parse: %w", err)
+		return
+	}
+
+	if fileLinks != blocksizes {
+		err = fmt.Errorf("unmatched links (%d) and blocksizes (%d) sister lists", fileLinks, blocksizes)
+		return
+	}
+
+	switch dataType {
+	case pbFile:
+		if len(directoryChildrens) != 0 {
+			err = errors.New("named links in file")
+			return
+		}
+
+		return TFile, File[Self, Children]{
+			Entry:     Entry[Self]{Cid: inCid, tSize: selfTSize + uint64(len(data))},
+			Data:      content,
+			Childrens: fileChildrens,
+		}, Directory[Self, Children]{}, Symlink[Self]{}, nil
+
+	// TODO: directory and symlink
+	default:
+		err = fmt.Errorf("unknown node type: %d", dataType)
+		return
+	}
+}
diff --git a/unixfs/unixfs_test.go b/unixfs/unixfs_test.go
new file mode 100644
index 000000000..b6495f432
--- /dev/null
+++ b/unixfs/unixfs_test.go
@@ -0,0 +1,150 @@
+package unixfs_test
+
+import (
+	"bytes"
+	"encoding/base64"
+	"testing"
+
+	. "github.com/ipfs/boxo/unixfs"
+	blocks "github.com/ipfs/go-block-format"
+	"github.com/ipfs/go-cid"
+	mh "github.com/multiformats/go-multihash"
+	"golang.org/x/exp/slices"
+)
+
+func TestRaw(t *testing.T) {
+	t.Parallel()
+	data := []byte("👋🌍️")
+	mh, err := mh.Sum(data, mh.BLAKE3, -1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	c := cid.NewCidV1(cid.Raw, mh)
+
+	validate := func(t *testing.T, f File[string, string]) {
+		if !bytes.Equal(data, f.Data) {
+			t.Errorf("expected %v got %v", data, f.Data)
+		}
+		if l := len(f.Childrens); l != 0 {
+			t.Errorf("expected 0 Childrens got %d", l)
+		}
+		tsize, ok := f.TSize()
+		if !ok {
+			t.Error("expected to find TSize but didn't")
+		} else if l := uint64(len(data)); tsize != l {
+			t.Errorf("expected tsize %d got %d", l, tsize)
+		}
+		if f.Cid != c {
+			t.Errorf("expected cid %s got %s", c, f.Cid)
+		}
+	}
+
+	t.Run("Parse", func(t *testing.T) {
+		t.Parallel()
+		b, err := blocks.NewBlockWithCid(data, c)
+		if err != nil {
+			t.Fatal(err)
+		}
+		a, err := Parse(b)
+		if err != nil {
+			t.Fatal(err)
+		}
+		f, ok := a.(File[string, string])
+		if !ok {
+			t.Fatalf("expected File got %T", a)
+		}
+		validate(t, f)
+	})
+	t.Run("ParseAppend", func(t *testing.T) {
+		t.Parallel()
+		var someArr [2]FileEntry[string]
+		typ, f, _, _, err := ParseAppend(someArr[:1], nil, c, data)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if typ != TFile {
+			t.Fatalf("expected %v got %v", TFile, typ)
+		}
+		validate(t, f)
+		// Check someArr[1] to ensure it doesn't touch already existing entries before len.
+		if &someArr[1] != &f.Childrens[:1][0] {
+			t.Fatal("expected pointers to still be aliased but they are not")
+		}
+	})
+}
+
+func TestFilePB(t *testing.T) {
+	t.Parallel()
+	data, err := base64.StdEncoding.DecodeString(`EisKIhIgVuq+9ViNicx1O8bIsb978a8u1uoTjm4taEeNW7gcB+cSABiu1OAVEioKIhIg7XyJKU3lrLCYFLKmcNTtKc82BUBCi5ePAeAqz2M1pWYSABirmGcKEAgCGPGUxxYggIDgFSDxlGc=`)
+	if err != nil {
+		t.Fatal(err)
+	}
+	mh, err := mh.Sum(data, mh.SHA2_256, -1)
+	if err != nil {
+		t.Fatal(err)
+	}
+	c := cid.NewCidV0(mh)
+
+	const firstChildrenTSize = 45623854
+	const secondChildrenTSize = 1690667
+	expectedChildrens := [2]FileEntry[string]{
+		FileEntryWithTSize(cid.MustParse("QmUBwP7RczPWbJSCpR4BygzvTNbJ2sfjt5yuRphSVYaJar"), 45613056, firstChildrenTSize),
+		FileEntryWithTSize(cid.MustParse("QmeKhUSkRVDFbxssXpnb15UQf25YdWN9Ck3rjfZA3tvD8h"), 1690225, secondChildrenTSize),
+	}
+
+	validate := func(t *testing.T, f File[string, string]) {
+		if f.Cid != c {
+			t.Errorf("expected %v cid got %v", c, f.Cid)
+		}
+
+		if len(f.Data) != 0 {
+			t.Errorf("got unexpected data %q", f.Data)
+		}
+
+		tSize, ok := f.TSize()
+		if !ok {
+			t.Error("missing TSize")
+		} else if et := uint64(len(data)) + firstChildrenTSize + secondChildrenTSize; tSize != et {
+			t.Errorf("tSize expected %d got %d", et, tSize)
+		}
+
+		if len(f.Childrens) != 2 {
+			t.Errorf("expected 2 childrens got %v", f.Childrens)
+		} else if !slices.Equal(f.Childrens, expectedChildrens[:]) {
+			t.Errorf("childrens don't match, expected %v got %v", expectedChildrens, f.Childrens)
+		}
+	}
+
+	t.Run("Parse", func(t *testing.T) {
+		t.Parallel()
+		b, err := blocks.NewBlockWithCid(data, c)
+		if err != nil {
+			t.Fatal(err)
+		}
+		a, err := Parse(b)
+		if err != nil {
+			t.Fatal(err)
+		}
+		f, ok := a.(File[string, string])
+		if !ok {
+			t.Fatalf("expected File got %T", a)
+		}
+		validate(t, f)
+	})
+	t.Run("ParseAppend", func(t *testing.T) {
+		t.Parallel()
+		var someArr [3]FileEntry[string]
+		typ, f, _, _, err := ParseAppend(someArr[:1], nil, c, data)
+		if err != nil {
+			t.Fatal(err)
+		}
+		if typ != TFile {
+			t.Fatalf("expected %v got %v", TFile, typ)
+		}
+		validate(t, f)
+		// Check someArr[1] to ensure it doesn't touch already existing entries before len.
+		if &someArr[1] != &f.Childrens[:1][0] {
+			t.Fatal("expected pointers to still be aliased but they are not")
+		}
+	})
+}
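
A minimal usage sketch for reviewers, combining the options defined in entry.go; the gateway URL, retry count, and timeout below are placeholder assumptions for illustration, not part of this change:

	package main

	import (
		"io"
		"net/http"
		"os"
		"time"

		"github.com/ipfs/boxo/unixfs/feather"
		"github.com/ipfs/go-cid"
	)

	func fetch(cidStr string) error {
		c, err := cid.Decode(cidStr)
		if err != nil {
			return err
		}
		client, err := feather.NewClient(
			feather.WithStaticGateway("http://localhost:8080"), // assumed local trustless gateway
			feather.WithRetries(3),                             // assumed retry budget; math.MaxUint means retry forever
			feather.WithHTTPClient(&http.Client{Timeout: 5 * time.Minute}), // assumed per-request timeout
		)
		if err != nil {
			return err
		}
		r, err := client.DownloadFile(c) // streams hash-verified file bytes
		if err != nil {
			return err
		}
		defer r.Close() // Close is mandatory, otherwise resources leak
		_, err = io.Copy(os.Stdout, r)
		return err
	}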
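
And a sketch of consuming the new unixfs package directly through the badged [Node] enum returned by unixfs.Parse; note that per the TODO in ParseAppend only raw and dag-pb files decode successfully in this patch, so the Directory and Symlink cases are shown for shape only:

	node, err := unixfs.Parse(block) // block is any blocks.Block whose data hashes to its CID
	if err != nil {
		return err
	}
	switch n := node.(type) {
	case unixfs.File[string, string]:
		_ = n.Data      // inline file bytes, aliased to block.RawData()
		_ = n.Childrens // child entries carrying CIDs and logical file sizes
	case unixfs.Directory[string, string]:
		_ = n.Childrens // named entries (decoding still TODO in this patch)
	case unixfs.Symlink[string]:
		_ = n.Value // symlink target (decoding still TODO in this patch)
	}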