From b1e11620a9c640792ae0a25cfe8d37c50bea421a Mon Sep 17 00:00:00 2001
From: Eric Myhre
Date: Thu, 14 Oct 2021 19:54:07 +0200
Subject: [PATCH] linking: add LoadRaw and LoadPlusRaw functions to LinkSystem.

These allow getting raw byte slices back along with the processed Node,
or just the raw byte slice alone.

Also added a great deal of documentation, including to existing functions
which were lacking it.
---
 linking/functions.go | 134 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 129 insertions(+), 5 deletions(-)

diff --git a/linking/functions.go b/linking/functions.go
index db61121d..6f959287 100644
--- a/linking/functions.go
+++ b/linking/functions.go
@@ -1,6 +1,7 @@
 package linking
 
 import (
+	"bytes"
 	"context"
 	"io"
 
@@ -12,6 +13,7 @@ import (
 // Varations:
 //  - Load vs Store vs ComputeLink
+//  - Load vs LoadPlusRaw
 //  - With or without LinkContext?
 //    - Brevity would be nice but I can't think of what to name the functions, so: everything takes LinkContext.
 //      Zero value is fine though.
 //  - [for load direction only]: Prototype (and return Node|error) or Assembler (and just return error)?
@@ -21,6 +23,31 @@
 // Can we get as far as a `QuickLoad(lnk Link) (Node, error)` function, which doesn't even ask you for a NodePrototype?
 //  No, not quite.  (Alas.)  If we tried to do so, and make it use `basicnode.Prototype`, we'd have import cycles; ded.
 
+// Load looks up some data identified by a Link, and does everything necessary to turn it into usable data.
+// In detail, that means it:
+// brings that data into memory,
+// verifies the hash,
+// parses it into the Data Model using a codec,
+// and returns an IPLD Node.
+//
+// Where the data will be loaded from is determined by the configuration of the LinkSystem
+// (namely, the StorageReadOpener callback, which can either be set directly,
+// or configured via the SetReadStorage function).
+//
+// The in-memory form used for the returned Node is determined by the given NodePrototype parameter.
+// A new builder and a new node will be allocated, via NodePrototype.NewBuilder.
+// (If you'd like more control over memory allocation, you may wish to see the Fill function instead.)
+//
+// A schema may also be used, and apply additional data validation during loading,
+// by using a schema.TypedNodePrototype as the NodePrototype argument.
+//
+// The LinkContext parameter may be used to pass contextual information down to the loading layer.
+//
+// Which hashing function is used to validate the loaded data is determined by LinkSystem.HasherChooser.
+// Which codec is used to parse the loaded data into the Data Model is determined by LinkSystem.DecoderChooser.
+//
+// The LinkSystem.NodeReifier callback is also applied before returning the Node,
+// and so Load may also thereby return an ADL.
 func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, error) {
 	nb := np.NewBuilder()
 	if err := lsys.Fill(lnkCtx, lnk, nb); err != nil {
@@ -33,6 +60,9 @@ func (lsys *LinkSystem) Load(lnkCtx LinkContext, lnk datamode
 	return lsys.NodeReifier(lnkCtx, nd, lsys)
 }
 
+// MustLoad is identical to Load, but panics in the case of errors.
+//
+// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
 func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) datamodel.Node {
 	if n, err := lsys.Load(lnkCtx, lnk, np); err != nil {
 		panic(err)
@@ -41,6 +71,88 @@ func (lsys *LinkSystem) MustLoad(lnkCtx LinkContext, lnk data
 	}
 }
 
+// LoadPlusRaw is similar to Load, but additionally retains and returns the byte slice of the raw data parsed.
+//
+// Be wary of using this with large data, since it will hold all data in memory at once.
+// For more control over streaming, you may want to construct a LinkSystem where you wrap the storage opener callbacks,
+// and thus can access the streams (and tee them, or whatever you need to do) as they're opened.
+// This function is meant for convenience when data sizes are small enough that fitting them into memory at once is not a problem.
+func (lsys *LinkSystem) LoadPlusRaw(lnkCtx LinkContext, lnk datamodel.Link, np datamodel.NodePrototype) (datamodel.Node, []byte, error) {
+	// Choose all the parts.
+	decoder, err := lsys.DecoderChooser(lnk)
+	if err != nil {
+		return nil, nil, ErrLinkingSetup{"could not choose a decoder", err}
+	}
+	// Use LoadRaw to get the data.
+	// If we're going to have everything in memory at once, we might as well do that first, and then give the codec and the hasher the whole thing at once.
+	block, err := lsys.LoadRaw(lnkCtx, lnk)
+	if err != nil {
+		return nil, block, err
+	}
+	// Create a NodeBuilder.
+	// Deploy the codec.
+	// Build the node.
+	nb := np.NewBuilder()
+	if err := decoder(nb, bytes.NewBuffer(block)); err != nil {
+		return nil, block, err
+	}
+	nd := nb.Build()
+	// Consider applying NodeReifier, if applicable.
+	if lsys.NodeReifier == nil {
+		return nd, block, nil
+	}
+	nd, err = lsys.NodeReifier(lnkCtx, nd, lsys)
+	return nd, block, err
+}
+
+// LoadRaw looks up some data identified by a Link, brings that data into memory,
+// verifies the hash, and returns it directly as a byte slice.
+//
+// LoadRaw does not return a data model view of the data,
+// nor does it verify that a codec can parse the data at all!
+// Use this function at your own risk; it does not provide the same guarantees as the Load or Fill functions do.
+func (lsys *LinkSystem) LoadRaw(lnkCtx LinkContext, lnk datamodel.Link) ([]byte, error) {
+	if lnkCtx.Ctx == nil {
+		lnkCtx.Ctx = context.Background()
+	}
+	// Choose all the parts.
+	hasher, err := lsys.HasherChooser(lnk.Prototype())
+	if err != nil {
+		return nil, ErrLinkingSetup{"could not choose a hasher", err}
+	}
+	if lsys.StorageReadOpener == nil {
+		return nil, ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
+	}
+	// Open storage: get the data.
+	// FUTURE: this could probably use storage.ReadableStorage.Get instead of streaming and a buffer, if we refactored LinkSystem to carry that interface through.
+	reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
+	if err != nil {
+		return nil, err
+	}
+	var buf bytes.Buffer
+	if _, err := io.Copy(&buf, reader); err != nil {
+		return nil, err
+	}
+	// Compute the hash.
+	// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
+	hasher.Write(buf.Bytes())
+	hash := hasher.Sum(nil)
+	lnk2 := lnk.Prototype().BuildLink(hash)
+	if lnk2 != lnk {
+		return nil, ErrHashMismatch{Actual: lnk2, Expected: lnk}
+	}
+	// No codec to deploy; this is the raw load function.
+	// So we're done.
+	return buf.Bytes(), nil
+}
+
+// Fill is similar to Load, but allows more control over memory allocations.
+// Instead of taking a NodePrototype parameter, Fill takes a NodeAssembler parameter:
+// this allows you to use your own NodeBuilder (and reset it, etc, thus controlling allocations),
+// or, to fill in some part of a larger structure.
+//
+// Note that Fill does not regard NodeReifier, even if one has been configured.
+// (This is in contrast to Load, which does regard a NodeReifier if one is configured, and thus may return an ADL node).
 func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) error {
 	if lnkCtx.Ctx == nil {
 		lnkCtx.Ctx = context.Background()
@@ -57,38 +169,50 @@ func (lsys *LinkSystem) Fill(lnkCtx LinkContext, lnk datamode
 	if lsys.StorageReadOpener == nil {
 		return ErrLinkingSetup{"no storage configured for reading", io.ErrClosedPipe} // REVIEW: better cause?
 	}
-	// Open storage, read it, verify it, and feed the codec to assemble the nodes.
+	// Open storage; get a reader stream.
 	reader, err := lsys.StorageReadOpener(lnkCtx, lnk)
 	if err != nil {
 		return err
 	}
-	// TrustaedStorage indicates the data coming out of this reader has already been hashed and verified earlier.
+	// TrustedStorage indicates the data coming out of this reader has already been hashed and verified earlier.
 	// As a result, we can skip rehashing it
 	if lsys.TrustedStorage {
 		return decoder(na, reader)
 	}
 	// Tee the stream so that the hasher is fed as the unmarshal progresses through the stream.
 	tee := io.TeeReader(reader, hasher)
+	// The actual read is then dragged forward by the codec.
 	decodeErr := decoder(na, tee)
-	if decodeErr != nil { // It is important to security to check the hash before returning any other observation about the content.
-		// This copy is for data remaining the block that wasn't already pulled through the TeeReader by the decoder.
+	if decodeErr != nil {
+		// It is important to security to check the hash before returning any other observation about the content,
+		// so, if the decode process returns any error, we have several steps to take before potentially returning it.
+		// First, we try to copy any data remaining that wasn't already pulled through the TeeReader by the decoder,
+		// so that the hasher can reach the end of the stream.
+		// If _that_ errors, return the I/O level error.
+		// We hang onto decodeErr for a while: we can't return that until all the way after we check the hash equality.
 		_, err := io.Copy(hasher, reader)
 		if err != nil {
 			return err
 		}
 	}
+	// Compute the hash.
+	// (Then do a bit of a jig to build a link out of it -- because that's what we do the actual hash equality check on.)
 	hash := hasher.Sum(nil)
-	// Bit of a jig to get something we can do the hash equality check on.
 	lnk2 := lnk.Prototype().BuildLink(hash)
 	if lnk2 != lnk {
 		return ErrHashMismatch{Actual: lnk2, Expected: lnk}
 	}
+	// If we got all the way through IO and through the hash check:
+	// now, finally, if we did get an error from the codec, we can admit to that.
 	if decodeErr != nil {
 		return decodeErr
 	}
 	return nil
 }
 
+// MustFill is identical to Fill, but panics in the case of errors.
+//
+// This function is meant for convenience of use in test and demo code, but should otherwise probably be avoided.
 func (lsys *LinkSystem) MustFill(lnkCtx LinkContext, lnk datamodel.Link, na datamodel.NodeAssembler) {
 	if err := lsys.Fill(lnkCtx, lnk, na); err != nil {
 		panic(err)
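
For reference, here is a minimal usage sketch of the functions this patch adds. It is not part of the patch itself, and it makes some assumptions: the dag-json codec registering itself with the default multicodec registry when imported, the cidlink and basicnode packages at their current import paths, and a toy in-memory block store (the blocks map and the two opener callbacks are purely illustrative) wired in through the StorageReadOpener and StorageWriteOpener fields.

package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/ipfs/go-cid"
	_ "github.com/ipld/go-ipld-prime/codec/dagjson" // assumed to register the dag-json codec in the default registry
	"github.com/ipld/go-ipld-prime/datamodel"
	"github.com/ipld/go-ipld-prime/fluent"
	"github.com/ipld/go-ipld-prime/linking"
	cidlink "github.com/ipld/go-ipld-prime/linking/cid"
	"github.com/ipld/go-ipld-prime/node/basicnode"
)

func main() {
	// A toy in-memory block store, keyed by link string (illustrative only).
	blocks := map[string][]byte{}

	// Start from the default LinkSystem (codec and multihash registries) and wire in storage callbacks.
	lsys := cidlink.DefaultLinkSystem()
	lsys.StorageWriteOpener = func(_ linking.LinkContext) (io.Writer, linking.BlockWriteCommitter, error) {
		var buf bytes.Buffer
		return &buf, func(lnk datamodel.Link) error {
			blocks[lnk.String()] = buf.Bytes()
			return nil
		}, nil
	}
	lsys.StorageReadOpener = func(_ linking.LinkContext, lnk datamodel.Link) (io.Reader, error) {
		data, ok := blocks[lnk.String()]
		if !ok {
			return nil, fmt.Errorf("block not found: %s", lnk)
		}
		return bytes.NewReader(data), nil
	}

	// Build a small map node and store it as dag-json, hashed with sha2-256.
	n := fluent.MustBuildMap(basicnode.Prototype.Map, 1, func(ma fluent.MapAssembler) {
		ma.AssembleEntry("hello").AssignString("world")
	})
	lp := cidlink.LinkPrototype{Prefix: cid.Prefix{
		Version:  1,
		Codec:    0x0129, // dag-json
		MhType:   0x12,   // sha2-256
		MhLength: 32,
	}}
	lnk, err := lsys.Store(linking.LinkContext{}, lp, n)
	if err != nil {
		panic(err)
	}

	// LoadPlusRaw returns both the decoded node and the hash-verified raw block bytes.
	nd, raw, err := lsys.LoadPlusRaw(linking.LinkContext{}, lnk, basicnode.Prototype.Any)
	if err != nil {
		panic(err)
	}
	fmt.Printf("node kind: %s, raw block: %s\n", nd.Kind(), raw)

	// LoadRaw returns only the hash-verified bytes; no codec is applied.
	raw2, err := lsys.LoadRaw(linking.LinkContext{}, lnk)
	if err != nil {
		panic(err)
	}
	fmt.Printf("raw again: %s\n", raw2)
}

Because LoadPlusRaw buffers the whole block before decoding, the same bytes can be handed to both the hasher and the codec in one pass; for very large blocks, wrapping the opener callbacks (as sketched above) and working with the streams directly is the safer route, as the LoadPlusRaw documentation in the patch notes.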