Skip to content

Commit

Permalink
Minor improvement for reduxer
Browse files Browse the repository at this point in the history
  • Loading branch information
waybackarchiver committed Feb 16, 2022
1 parent 6089857 commit b072a7d
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 40 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ require (
github.com/wabarc/playback v0.0.0-20210718054702-cab6c6004933
github.com/wabarc/rivet v0.0.0-20220207154318-37fc56bcf4e1
github.com/wabarc/screenshot v1.4.1-0.20211226132820-f5eed318376e
github.com/wabarc/telegra.ph v0.0.0-20210822083402-82f95ce60a37
github.com/wabarc/telegra.ph v0.0.0-20220216145835-479d23542bfc
github.com/wabarc/warcraft v0.2.2-0.20211107142816-7beea5a75ab5
go.etcd.io/bbolt v1.3.6
golang.org/x/net v0.0.0-20220121210141-e204ce36a2ba
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -713,8 +713,8 @@ github.com/wabarc/rivet v0.0.0-20220207154318-37fc56bcf4e1/go.mod h1:aWsIBu+Jr99
github.com/wabarc/screenshot v1.3.1/go.mod h1:ei8rqXW5mdztkqcsb81YUVwBZFROgyjAQJrEKEiMWfY=
github.com/wabarc/screenshot v1.4.1-0.20211226132820-f5eed318376e h1:8QUF3oJ/u4doLXiG5gwIu+NUrfTnusGfjzTz4mVZ1p0=
github.com/wabarc/screenshot v1.4.1-0.20211226132820-f5eed318376e/go.mod h1:HhtMtB0tOiUId8zteVvMQDfJJ1Wa/c3Mg5KKrWNYlrs=
github.com/wabarc/telegra.ph v0.0.0-20210822083402-82f95ce60a37 h1:lqVzAnARDEJO+bTtQiCRu2lg55QxboohtP/RZN9Y9mU=
github.com/wabarc/telegra.ph v0.0.0-20210822083402-82f95ce60a37/go.mod h1:532VM0F+WU2TSVvolJN3U5xihKLBt8ubLqyfWExM7As=
github.com/wabarc/telegra.ph v0.0.0-20220216145835-479d23542bfc h1:r2howsA9nt3I1GiPKeG1FO1S2rLTJsICFSX6CcXkxos=
github.com/wabarc/telegra.ph v0.0.0-20220216145835-479d23542bfc/go.mod h1:A8Q31u/3x1+v+oGPtQE9xp1+fN9WDePAI9OS7RCOhyY=
github.com/wabarc/warcraft v0.2.2-0.20211107142816-7beea5a75ab5 h1:jY/jqIy/ddCMWWWuTIeAazE5F4QW8HAIvlI69XMJ1ew=
github.com/wabarc/warcraft v0.2.2-0.20211107142816-7beea5a75ab5/go.mod h1:/BbCwReBjlqHRaw8Yh+7sfAicOesiMYNhiFpuL1x8Rc=
github.com/whyrusleeping/tar-utils v0.0.0-20180509141711-8c6c8ba81d5c/go.mod h1:xxcJeBb7SIUl/Wzkz1eVKJE/CB34YNrqX2TQI6jY9zs=
Expand Down
44 changes: 28 additions & 16 deletions reduxer/reduxer.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,40 +72,53 @@ type Remote struct {
type Bundles map[string]*Bundle

// Get returns a Bundle by given name.
func (b Bundles) Get(name string) *Bundle {
return b[name]
func (bs Bundles) Get(name string) (bundle *Bundle) {
if b := bs[name]; b != nil {
bundle = b
}
return
}

// Shot copies the URL, Title, Image, HTML and PDF fields of the Bundle
// into a screenshot.Screenshots value. Calling it on a nil *Bundle is
// allowed and yields the zero value.
func (b *Bundle) Shot() (s screenshot.Screenshots) {
	// Guard first: a nil receiver has nothing to copy.
	if b == nil {
		return s
	}

	s.URL = b.URL
	s.Title = b.Title
	s.Image = b.Image
	s.HTML = b.HTML
	s.PDF = b.PDF
	return s
}

// Do takes a screenshot, prints a PDF and exports the HTML of the given URLs.
// It returns a set of bundles containing the screenshot data and file paths.
// nolint:gocyclo
func Do(ctx context.Context, urls ...*url.URL) (Bundles, error) {
bundles := make(Bundles, len(urls))
if !config.Opts.EnabledReduxer() {
return nil, errors.New("Specify directory to environment `WAYBACK_STORAGE_DIR` to enable reduxer")
return bundles, errors.New("Specify directory to environment `WAYBACK_STORAGE_DIR` to enable reduxer")
}

shots, err := capture(ctx, urls...)
if err != nil {
return nil, err
return bundles, err
}

dir, err := createDir(config.Opts.StorageDir())
if err != nil {
return nil, err
return bundles, err
}

var wg sync.WaitGroup
var mu sync.Mutex
var warc = &warcraft.Warcraft{BasePath: dir, UserAgent: config.Opts.WaybackUserAgent()}
var craft = func(in string) string {
u, err := url.Parse(in)
if err != nil {
logger.Debug("create warc for %s failed", u.String())
return ""
}
path, err := warc.Download(ctx, u)
var craft = func(in *url.URL) string {
path, err := warc.Download(ctx, in)
if err != nil {
logger.Debug("create warc for %s failed: %v", u.String(), err)
logger.Debug("create warc for %s failed: %v", in.String(), err)
return ""
}
return path
Expand All @@ -116,7 +129,6 @@ func Do(ctx context.Context, urls ...*url.URL) (Bundles, error) {
buf []byte
}

bundles := make(Bundles)
for _, shot := range shots {
wg.Add(1)
go func(shot screenshot.Screenshots) {
Expand All @@ -129,6 +141,7 @@ func Do(ctx context.Context, urls ...*url.URL) (Bundles, error) {
{key: &assets.Raw, buf: shot.HTML},
{key: &assets.HAR, buf: shot.HAR},
}
u, _ := url.Parse(shot.URL)
for _, slug := range slugs {
if slug.buf == nil {
logger.Warn("file empty, skipped")
Expand All @@ -152,13 +165,12 @@ func Do(ctx context.Context, urls ...*url.URL) (Bundles, error) {
}
}
// Set path of WARC file directly to avoid read file as buffer
if err := helper.SetField(&assets.WARC, "Local", craft(shot.URL)); err != nil {
if err := helper.SetField(&assets.WARC, "Local", craft(u)); err != nil {
logger.Error("assign field WARC to path struct failed: %v", err)
}
if err := helper.SetField(&assets.Media, "Local", media(ctx, dir, shot.URL)); err != nil {
logger.Error("assign field Media to path struct failed: %v", err)
}
u, _ := url.Parse(shot.URL)
article, err := readability.FromReader(bytes.NewReader(shot.HTML), u)
if err != nil {
logger.Error("parse html failed: %v", err)
Expand Down
27 changes: 6 additions & 21 deletions wayback.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ import (
"github.com/wabarc/logger"
"github.com/wabarc/playback"
"github.com/wabarc/rivet/ipfs"
"github.com/wabarc/screenshot"
"github.com/wabarc/wayback/config"
"github.com/wabarc/wayback/errors"
"github.com/wabarc/wayback/reduxer"
Expand Down Expand Up @@ -119,16 +118,14 @@ func (i IP) Wayback() string {
secret := config.Opts.IPFSSecret()
opts = append(opts, ipfs.Uses(target), ipfs.Apikey(apikey), ipfs.Secret(secret))
}

arc := &ip.Shaft{Hold: ipfs.Options(opts...)}
ctx := i.ctx

// If there is bundled HTML, it is utilized as the basis for IPFS
// archiving and is sent to obelisk to crawl the rest of the page.
if i.bundle != nil {
i.ctx = arc.WithInput(i.ctx, i.bundle.HTML)
}
ctx = arc.WithInput(i.ctx, i.bundle.HTML)

dst, err := arc.Wayback(i.ctx, i.URL)
dst, err := arc.Wayback(ctx, i.URL)
if err != nil {
logger.Error("wayback %s to IPFS failed: %v", i.URL.String(), err)
return fmt.Sprint(err)
Expand All @@ -140,32 +137,20 @@ func (i IP) Wayback() string {
// it reads URL from the PH and returns archived URL as a string.
func (i PH) Wayback() string {
arc := &ph.Archiver{}
arc.SetShot(i.parseShot())
ctx := arc.WithShot(i.ctx, i.bundle.Shot())
if config.Opts.EnabledChromeRemote() {
arc.ByRemote(config.Opts.ChromeRemoteAddr())
}
ctx = arc.WithArticle(ctx, i.bundle.Article)

dst, err := arc.Wayback(i.ctx, i.URL)
dst, err := arc.Wayback(ctx, i.URL)
if err != nil {
logger.Error("wayback %s to telegra.ph failed: %v", i.URL.String(), err)
return fmt.Sprint(err)
}
return dst
}

// parseShot builds a screenshot.Screenshots from the bundle attached to i,
// copying its URL, Title, Image, HTML and PDF fields. When no bundle is
// attached the zero value is returned.
func (i PH) parseShot() (shot screenshot.Screenshots) {
	bundle := i.bundle
	// Nothing to copy without a bundle.
	if bundle == nil {
		return shot
	}

	shot.URL = bundle.URL
	shot.Title = bundle.Title
	shot.Image = bundle.Image
	shot.HTML = bundle.HTML
	shot.PDF = bundle.PDF
	return shot
}

// wayback invokes the Wayback method of the given Waybacker and returns
// its result unchanged.
func wayback(w Waybacker) string {
	result := w.Wayback()
	return result
}
Expand Down

0 comments on commit b072a7d

Please sign in to comment.