Skip to content

Commit

Permalink
linking: add LoadRaw and LoadPlusRaw functions to LinkSystem.
Browse files Browse the repository at this point in the history
These allow getting raw byte slices back along with the processed Node,
or just the raw byte slice alone.

Also added a great deal of documentation, including to existing
functions which were lacking it.
  • Loading branch information
warpfork committed Oct 14, 2021
1 parent 273362c commit b1e1162
Showing 1 changed file with 129 additions and 5 deletions.
134 changes: 129 additions & 5 deletions linking/functions.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package linking

import (
"bytes"
"context"
"io"

Expand All @@ -12,6 +13,7 @@ import (

// Variations:
// - Load vs Store vs ComputeLink
// - Load vs LoadPlusRaw
// - With or without LinkContext?
// - Brevity would be nice but I can't think of what to name the functions, so: everything takes LinkContext. Zero value is fine though.
// - [for load direction only]: Prototype (and return Node|error) or Assembler (and just return error)?
Expand All @@ -21,6 +23,31 @@ import (
// Can we get as far as a `QuickLoad(lnk Link) (Node, error)` function, which doesn't even ask you for a NodePrototype?
// No, not quite. (Alas.) If we tried to do so, and make it use `basicnode.Prototype`, we'd have import cycles; ded.

// Load looks up some data identified by a Link, and does everything necessary to turn it into usable data.
// In detail, that means it:
// brings that data into memory,
// verifies the hash,
// parses it into the Data Model using a codec,
// and returns an IPLD Node.
//
// Where the data will be loaded from is determined by the configuration of the LinkSystem
// (namely, the StorageReadOpener callback, which can either be set directly,
// or configured via the SetReadStorage function).
//
// The in-memory form used for the returned Node is determined by the given NodePrototype parameter.
// A new builder and a new node will be allocated, via NodePrototype.NewBuilder.
// (If you'd like more control over memory allocation, you may wish to see the Fill function instead.)
//
// A schema may also be used, and apply additional data validation during loading,
// by using a schema.TypedNodePrototype as the NodePrototype argument.
//
// The LinkContext parameter may be used to pass contextual information down to the loading layer.
//
// Which hashing function is used to validate the loaded data is determined by LinkSystem.HasherChooser.
// Which codec is used to parse the loaded data into the Data Model is determined by LinkSystem.DecoderChooser.
//
// The LinkSystem.NodeReifier callback is also applied before returning the Node,
// and so Load may also thereby return an ADL.
func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, error) {
nb := np.NewBuilder()
if err := lsys.Fill(lnkCtx, lnk, nb); err != nil {
Expand All @@ -33,6 +60,9 @@ func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamode
return lsys.NodeReifier(lnkCtx, nd, lsys)
}

// MustLoad is identical to Load, but panics in the case of errors.
//
// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) datamodel.Node {
if n, err := lsys.Load(lnkCtx, lnk, np); err != nil {
panic(err)
Expand All @@ -41,6 +71,88 @@ func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np data
}
}

// LoadPlusRaw behaves like Load, but also hands back the raw bytes that the Node was decoded from.
//
// The whole block is held in memory at once, so be cautious with large data.
// If you need streaming control, build a LinkSystem whose storage opener callbacks are wrapped,
// so that you can observe (or tee) the streams as they are opened.
// This function is a convenience for data small enough that buffering it entirely is not a concern.
//
// Return values:
//   - on success: the decoded Node (after NodeReifier, if one is set), the raw bytes, and a nil error;
//   - if no decoder can be chosen: nil, nil, and an ErrLinkingSetup;
//   - if LoadRaw fails: a nil Node, whatever LoadRaw returned for the bytes, and LoadRaw's error;
//   - if decoding fails: a nil Node, the raw bytes, and the decoder's error;
//   - if NodeReifier fails: whatever Node the reifier returned, the raw bytes, and the reifier's error.
func (lsys *LinkSystem) LoadPlusRaw(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, []byte, error) {
	// Pick the codec up front, so that a misconfiguration is reported before any storage is touched.
	decode, err := lsys.DecoderChooser(lnk)
	if err != nil {
		return nil, nil, ErrLinkingSetup{"could not choose a decoder", err}
	}

	// Delegate fetching and hash verification to LoadRaw.
	// Since everything ends up in memory anyway, we fetch it all first and then feed the codec in one go.
	raw, err := lsys.LoadRaw(lnkCtx, lnk)
	if err != nil {
		return nil, raw, err
	}

	// Allocate a fresh builder from the prototype, and let the codec populate it from the buffered bytes.
	builder := np.NewBuilder()
	if err = decode(builder, bytes.NewBuffer(raw)); err != nil {
		return nil, raw, err
	}
	node := builder.Build()

	// Apply the NodeReifier when one is configured; otherwise the plain node is the result.
	if lsys.NodeReifier != nil {
		node, err = lsys.NodeReifier(lnkCtx, node, lsys)
		return node, raw, err
	}
	return node, raw, nil
}

// LoadRaw looks up some data identified by a Link, brings that data into memory,
// verifies the hash, and returns it directly as a byte slice.
//
// LoadRaw does not return a data model view of the data,
// nor does it verify that a codec can parse the data at all!
// Use this function at your own risk; it does not provide the same guarantees as the Load or Fill functions do.
//
// If lnkCtx.Ctx is nil, context.Background() is used in its place.
//
// Errors:
//   - ErrLinkingSetup if no hasher can be chosen for the link, or if no StorageReadOpener is configured;
//   - any error from opening or reading the storage stream, returned as-is;
//   - ErrHashMismatch if the hash of the bytes read does not produce a link equal to lnk.
//
// In every error case the returned byte slice is nil.
//
// If the reader handed back by the StorageReadOpener also implements io.Closer,
// it is closed before LoadRaw returns (on success and on error alike).
func (lsys *LinkSystem) LoadRaw(lnkCtx LinkContext, lnk datamodel.Link) ([]byte, error) {
	if lnkCtx.Ctx == nil {
		lnkCtx.Ctx = context.Background()
	}
	// Choose all the parts.
	hasher, err := lsys.HasherChooser(lnk.Prototype())
	if err != nil {
		return nil, ErrLinkingSetup{"could not choose a hasher", err}
	}
	if lsys.StorageReadOpener == nil {
		return nil, ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
	}
	// Open storage: get the data.
	// FUTURE: this could probably use storage.ReadableStorage.Get instead of streaming and a buffer, if we refactored LinkSystem to carry that interface through.
	reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
	if err != nil {
		return nil, err
	}
	// A storage implementation may hand back a reader that also holds a resource
	// (an open file, a network body, ...). If it is closable, release it once we're done,
	// no matter which return path we take.
	// The Close error is deliberately ignored: by then the data has either been fully read
	// (and is judged by the hash check below) or another error is already being returned.
	if closer, ok := reader.(io.Closer); ok {
		defer closer.Close()
	}
	var buf bytes.Buffer
	if _, err := io.Copy(&buf, reader); err != nil {
		return nil, err
	}
	// Compute the hash.
	// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
	// The Write result is not checked: the hash.Hash contract states that Write never returns an error.
	hasher.Write(buf.Bytes())
	hash := hasher.Sum(nil)
	lnk2 := lnk.Prototype().BuildLink(hash)
	if lnk2 != lnk {
		return nil, ErrHashMismatch{Actual: lnk2, Expected: lnk}
	}
	// No codec to deploy; this is the raw load function.
	// So we're done.
	return buf.Bytes(), nil
}

// Fill is similar to Load, but allows more control over memory allocations.
// Instead of taking a NodePrototype parameter, Fill takes a NodeAssembler parameter:
// this allows you to use your own NodeBuilder (and reset it, etc, thus controlling allocations),
// or, to fill in some part of a larger structure.
//
// Note that Fill does not regard NodeReifier, even if one has been configured.
// (This is in contrast to Load, which does regard a NodeReifier if one is configured, and thus may return an ADL node).
func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) error {
if lnkCtx.Ctx == nil {
lnkCtx.Ctx = context.Background()
Expand All @@ -57,38 +169,50 @@ func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamode
if lsys.StorageReadOpener == nil {
return ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
}
// Open storage, read it, verify it, and feed the codec to assemble the nodes.
// Open storage; get a reader stream.
reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
if err != nil {
return err
}
// TrustaedStorage indicates the data coming out of this reader has already been hashed and verified earlier.
// TrustedStorage indicates the data coming out of this reader has already been hashed and verified earlier.
// As a result, we can skip rehashing it
if lsys.TrustedStorage {
return decoder(na, reader)
}
// Tee the stream so that the hasher is fed as the unmarshal progresses through the stream.
tee := io.TeeReader(reader, hasher)
// The actual read is then dragged forward by the codec.
decodeErr := decoder(na, tee)
if decodeErr != nil { // It is important to security to check the hash before returning any other observation about the content.
// This copy is for data remaining the block that wasn't already pulled through the TeeReader by the decoder.
if decodeErr != nil {
// It is important to security to check the hash before returning any other observation about the content,
// so, if the decode process returns any error, we have several steps to take before potentially returning it.
// First, we try to copy any data remaining that wasn't already pulled through the TeeReader by the decoder,
// so that the hasher can reach the end of the stream.
// If _that_ errors, return the I/O level error.
// We hang onto decodeErr for a while: we can't return that until all the way after we check the hash equality.
_, err := io.Copy(hasher, reader)
if err != nil {
return err
}
}
// Compute the hash.
// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
hash := hasher.Sum(nil)
// Bit of a jig to get something we can do the hash equality check on.
lnk2 := lnk.Prototype().BuildLink(hash)
if lnk2 != lnk {
return ErrHashMismatch{Actual: lnk2, Expected: lnk}
}
// If we got all the way through IO and through the hash check:
// now, finally, if we did get an error from the codec, we can admit to that.
if decodeErr != nil {
return decodeErr
}
return nil
}

// MustFill is identical to Fill, but panics in the case of errors.
//
// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
func (lsys *LinkSystem) MustFill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) {
if err := lsys.Fill(lnkCtx, lnk, na); err != nil {
panic(err)
Expand Down

0 comments on commit b1e1162

Please sign in to comment.