-
Notifications
You must be signed in to change notification settings - Fork 44
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
support extraction of unixfs content stored in car files #263
Changes from 2 commits
92a65c1
e14a9dc
d492bd4
8517c62
52c898d
344571f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,192 @@ | ||
package main | ||
|
||
import ( | ||
"bytes" | ||
"fmt" | ||
"io" | ||
"os" | ||
"path" | ||
|
||
"github.com/ipfs/go-cid" | ||
"github.com/ipfs/go-unixfsnode" | ||
"github.com/ipfs/go-unixfsnode/data" | ||
"github.com/ipfs/go-unixfsnode/file" | ||
"github.com/ipld/go-car/v2/blockstore" | ||
dagpb "github.com/ipld/go-codec-dagpb" | ||
"github.com/ipld/go-ipld-prime" | ||
cidlink "github.com/ipld/go-ipld-prime/linking/cid" | ||
basicnode "github.com/ipld/go-ipld-prime/node/basic" | ||
"github.com/urfave/cli/v2" | ||
) | ||
|
||
// ExtractCar pulls files and directories out of a car | ||
func ExtractCar(c *cli.Context) error { | ||
if !c.IsSet("file") { | ||
return fmt.Errorf("a file source must be specified") | ||
} | ||
masih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
outputDir, err := os.Getwd() | ||
if err != nil { | ||
return err | ||
} | ||
if c.Args().Len() > 0 { | ||
masih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
outputDir = c.Args().First() | ||
} | ||
|
||
bs, err := blockstore.OpenReadOnly(c.String("file")) | ||
masih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if err != nil { | ||
return err | ||
} | ||
|
||
ls := cidlink.DefaultLinkSystem() | ||
ls.TrustedStorage = true | ||
ls.StorageReadOpener = func(_ ipld.LinkContext, l ipld.Link) (io.Reader, error) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. graphsync's storeutil.LinkSystemForBlockstore does this quite pleasantly -- sets trusted storage, adds a good storage read opener, even includes some optimizations for bytes.Buffer. We should really, really consider putting this in ipld-prime, or somewhere that's obviously accessable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's a glue module for this coming in ipld/go-ipld-prime#279 (which I haven't merged yet, but don't actually know what I'm waiting for, either), so we could use that shortly. |
||
cl, ok := l.(cidlink.Link) | ||
if !ok { | ||
return nil, fmt.Errorf("not a cidlink") | ||
} | ||
blk, err := bs.Get(cl.Cid) | ||
if err != nil { | ||
return nil, err | ||
} | ||
return bytes.NewBuffer(blk.RawData()), nil | ||
} | ||
|
||
roots, err := bs.Roots() | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, root := range roots { | ||
if err := extractRoot(c, &ls, root, outputDir); err != nil { | ||
return err | ||
} | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func extractRoot(c *cli.Context, ls *ipld.LinkSystem, root cid.Cid, outputDir string) error { | ||
if root.Prefix().Codec == cid.Raw { | ||
if c.IsSet("verbose") { | ||
fmt.Fprintf(os.Stderr, "skipping raw root %s\n", root) | ||
masih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return nil | ||
} | ||
|
||
pbn, err := ls.Load(ipld.LinkContext{}, cidlink.Link{Cid: root}, dagpb.Type.PBNode) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This code seems to assume that if you don't have a raw node, you have a root that is a UnixFS Directory, not a regular file (even though extractFile exists as a function). Is this the intent? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this is generally the case. if i export a single file it'll be implicitly in a directory in order to provide the file name of the file. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. fwiw, to compare to other unix stuff, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. okay for that to go in a follow-up? |
||
if err != nil { | ||
return err | ||
} | ||
pbnode := pbn.(dagpb.PBNode) | ||
|
||
ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not using a the Reify feature of LinkSystem for some reason? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. have not entirely wrapped my head around all the code here, but it's possible that you could use I have not analyzed if that would make this code easier to reason about or reuse; just offering the possibility. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. that would be a new coordination with unixfsnode somehow that seems harder to reason about though. I think this is a reasonable setup for applying the reification explicitly when needed. |
||
if err != nil { | ||
return err | ||
} | ||
|
||
if err := extractDir(c, ls, ufn, outputDir); err != nil { | ||
return fmt.Errorf("%s: %w", root, err) | ||
} | ||
|
||
return nil | ||
} | ||
|
||
func extractDir(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputDir string) error { | ||
// make the directory. | ||
os.MkdirAll(outputDir, 0755) | ||
masih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if n.Kind() == ipld.Kind_Map { | ||
mi := n.MapIterator() | ||
for !mi.Done() { | ||
key, val, err := mi.Next() | ||
if err != nil { | ||
return err | ||
} | ||
ks, err := key.AsString() | ||
if err != nil { | ||
return err | ||
} | ||
if c.IsSet("verbose") { | ||
fmt.Fprintf(os.Stdout, "%s\n", path.Join(outputDir, ks)) | ||
} | ||
|
||
if val.Kind() != ipld.Kind_Link { | ||
return fmt.Errorf("unexpected map value for %s at %s", ks, outputDir) | ||
} | ||
// a directory may be represented as a map of name:<link> if unixADL is applied | ||
vl, err := val.AsLink() | ||
if err != nil { | ||
return err | ||
} | ||
dest, err := ls.Load(ipld.LinkContext{}, vl, basicnode.Prototype.Any) | ||
if err != nil { | ||
return err | ||
} | ||
// degenerate files are handled here. | ||
if dest.Kind() == ipld.Kind_Bytes { | ||
if err := extractFile(c, ls, dest, path.Join(outputDir, ks)); err != nil { | ||
return err | ||
} | ||
continue | ||
} else { | ||
// dir / pbnode | ||
pbb := dagpb.Type.PBNode.NewBuilder() | ||
if err := pbb.AssignNode(dest); err != nil { | ||
return err | ||
} | ||
dest = pbb.Build() | ||
} | ||
pbnode := dest.(dagpb.PBNode) | ||
|
||
// interpret dagpb 'data' as unixfs data and look at type. | ||
ufsData, err := pbnode.LookupByString("Data") | ||
if err != nil { | ||
return err | ||
} | ||
ufsBytes, err := ufsData.AsBytes() | ||
if err != nil { | ||
return err | ||
} | ||
ufsNode, err := data.DecodeUnixFSData(ufsBytes) | ||
if err != nil { | ||
return err | ||
} | ||
if ufsNode.DataType.Int() == data.Data_Directory || ufsNode.DataType.Int() == data.Data_HAMTShard { | ||
ufn, err := unixfsnode.Reify(ipld.LinkContext{}, pbnode, ls) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
if err := extractDir(c, ls, ufn, path.Join(outputDir, ks)); err != nil { | ||
return err | ||
} | ||
} else if ufsNode.DataType.Int() == data.Data_File || ufsNode.DataType.Int() == data.Data_Raw { | ||
if err := extractFile(c, ls, pbnode, path.Join(outputDir, ks)); err != nil { | ||
return err | ||
} | ||
} else if ufsNode.DataType.Int() == data.Data_Symlink { | ||
data := ufsNode.Data.Must().Bytes() | ||
if err := os.Symlink(string(data), path.Join(outputDir, ks)); err != nil { | ||
return err | ||
} | ||
} | ||
} | ||
return nil | ||
} | ||
return fmt.Errorf("not a directory") | ||
} | ||
|
||
func extractFile(c *cli.Context, ls *ipld.LinkSystem, n ipld.Node, outputName string) error { | ||
node, err := file.NewUnixFSFile(c.Context, n, ls) | ||
if err != nil { | ||
return err | ||
} | ||
f, err := os.Create(outputName) | ||
if err != nil { | ||
return err | ||
} | ||
defer f.Close() | ||
_, err = io.Copy(f, node) | ||
|
||
return err | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Pack two plain files into a car.
car create --file=out.car foo.txt bar.txt
mkdir out
# Extract verbosely into out/: stderr must stay empty and stdout must have
# two matches of 'txt$'.
car extract -v -f out.car out
! stderr .
stdout -count=2 'txt$'
# Re-pack the extracted files and compare the result with the original car.
car create --file=out2.car out/foo.txt out/bar.txt
cmp out.car out2.car
|
||
-- foo.txt -- | ||
foo content | ||
-- bar.txt -- | ||
bar content |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looking at the code, it appears that giving it anything other than a car whose roots are DagPB/Raw will produce an error. Perhaps it is worth adding a doc comment that says so, e.g. "Extracts the contents of a car when the car encodes UnixFS data".
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should think about other conventions that we should try to interpret, such as HAMTs, structured data, etc.