Skip to content

Commit

Permalink
Add support for storing a page as a single file (#184)
Browse files Browse the repository at this point in the history
  • Loading branch information
waybackarchiver authored Jul 16, 2022
1 parent a206f89 commit 0067fb4
Show file tree
Hide file tree
Showing 14 changed files with 190 additions and 38 deletions.
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ require (
github.com/fatih/color v1.13.0
github.com/gabriel-vasile/mimetype v1.4.0
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b
github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7
github.com/google/go-github/v40 v40.0.0
github.com/gorilla/mux v1.8.0
github.com/gorilla/websocket v1.5.0
Expand All @@ -37,7 +38,7 @@ require (
github.com/wabarc/imgbb v1.0.0
github.com/wabarc/ipfs-pinner v1.1.1-0.20220126131044-16299c0dd43d
github.com/wabarc/logger v0.0.0-20210730133522-86bd3f31e792
github.com/wabarc/playback v0.0.0-20220610121013-427fb317031c
github.com/wabarc/playback v0.0.0-20220715111526-90d0327d3f04
github.com/wabarc/rivet v0.0.0-20220207154318-37fc56bcf4e1
github.com/wabarc/screenshot v1.5.1-0.20220318140348-632a135d50db
github.com/wabarc/telegra.ph v0.0.0-20220501011455-b0b8c35c6d09
Expand Down Expand Up @@ -68,7 +69,6 @@ require (
github.com/dlclark/regexp2 v1.4.1-0.20201116162257-a2a8dda75c91 // indirect
github.com/dop251/goja v0.0.0-20211211112501-fb27c91c26ed // indirect
github.com/go-shiori/dom v0.0.0-20210627111528-4e4722cd0d65 // indirect
github.com/go-shiori/obelisk v0.0.0-20220314133127-347d73aad05b // indirect
github.com/go-sourcemap/sourcemap v2.1.3+incompatible // indirect
github.com/gobwas/httphead v0.1.0 // indirect
github.com/gobwas/pool v0.2.1 // indirect
Expand Down
7 changes: 4 additions & 3 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -230,8 +230,9 @@ github.com/go-shiori/go-readability v0.0.0-20210627123243-82cc33435520/go.mod h1
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b h1:yrGomo5CP7IvXwSwKbDeaJkhwa4BxfgOO/s1V7iOQm4=
github.com/go-shiori/go-readability v0.0.0-20220215145315-dd6828d2f09b/go.mod h1:LTRGsNyO3/Y6u3ERbz17OiXy2qO1Y+/8QjXpg2ViyEY=
github.com/go-shiori/obelisk v0.0.0-20201115143556-8de0d40b0a9b/go.mod h1:OlGUo0utWqPMcej0HirsoK2+65Je/ZPFNwasF/O3KYM=
github.com/go-shiori/obelisk v0.0.0-20220314133127-347d73aad05b h1:k9qMjvbfeCA4glJTSP6TC++R/D5z7UNLemc3zX76sBI=
github.com/go-shiori/obelisk v0.0.0-20220314133127-347d73aad05b/go.mod h1:qKa73D7hc0YucHndvsCOgZ5Ap54XgSmZxaIytNAFUAQ=
github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7 h1:+TWg0Pe3/7YUbL0MuF4O/PdN+68M4HsUt1GyER/pvbU=
github.com/go-shiori/obelisk v0.0.0-20220524135250-3d6752a59bd7/go.mod h1:qKa73D7hc0YucHndvsCOgZ5Ap54XgSmZxaIytNAFUAQ=
github.com/go-sourcemap/sourcemap v2.1.3+incompatible h1:W1iEw64niKVGogNgBN3ePyLFfuisuzeidWPMPWmECqU=
github.com/go-sourcemap/sourcemap v2.1.3+incompatible/go.mod h1:F8jJfvm2KbVjc5NqelyYJmf/v5J0dwNLS2mL4sNA1Jg=
github.com/go-stack/stack v1.8.0/go.mod h1:v0f6uXyyMGvRgIKkXu+yp6POWl0qKG85gN/melR3HDY=
Expand Down Expand Up @@ -719,8 +720,8 @@ github.com/wabarc/logger v0.0.0-20210730133522-86bd3f31e792 h1:0xD4NsQtdPCB77q4w
github.com/wabarc/logger v0.0.0-20210730133522-86bd3f31e792/go.mod h1:2an5YHL0CegFbaDNSU3kwqYRfxFfmcwlwSi5Bk4yk5c=
github.com/wabarc/memento v0.0.0-20210703205719-adc2f8ab8bae h1:7TLEQYM7GxeysDd6IbK5F7krZuYEipdXno9qV3eedWU=
github.com/wabarc/memento v0.0.0-20210703205719-adc2f8ab8bae/go.mod h1:qP6GisnqoSDc5Ivj34yV+Qar3B6tYyLDKx42oCDX1zY=
github.com/wabarc/playback v0.0.0-20220610121013-427fb317031c h1:8a4FvrKx6esUHAX3kU5yioPZZ9YTUU7ojDSCsPd47eg=
github.com/wabarc/playback v0.0.0-20220610121013-427fb317031c/go.mod h1:rxvBzfzci+rMEEsupy1yo58Nbg9KtUmY4WJ67EPzNH8=
github.com/wabarc/playback v0.0.0-20220715111526-90d0327d3f04 h1:D1XXhpIu2DO0LEvqnVYYdH7x5cEdADDEk6GIZoW4eow=
github.com/wabarc/playback v0.0.0-20220715111526-90d0327d3f04/go.mod h1:rxvBzfzci+rMEEsupy1yo58Nbg9KtUmY4WJ67EPzNH8=
github.com/wabarc/rivet v0.0.0-20220207154318-37fc56bcf4e1 h1:4TKACQ7hhNwQ4SUaIVXAibaqSs5r/4CrgaYL/+HtEak=
github.com/wabarc/rivet v0.0.0-20220207154318-37fc56bcf4e1/go.mod h1:aWsIBu+Jr99P+G8KZ/NS/e6Yr5OEamt3i397U31QRu8=
github.com/wabarc/screenshot v1.5.0/go.mod h1:zacDw0LvNwstoCSqu2BYm8drXYOZ877TfDE6KhBEswY=
Expand Down
7 changes: 7 additions & 0 deletions reduxer/example.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ func BundleExample() Reduxer {
Catbox: "https://files.catbox.moe/3agtva.har",
},
},
HTM: Asset{
Local: "/path/to/single-htm",
Remote: Remote{
Anonfile: "https://anonfiles.com/v4G4S09abc",
Catbox: "",
},
},
WARC: Asset{
Local: "/path/to/warc",
Remote: Remote{
Expand Down
57 changes: 50 additions & 7 deletions reduxer/reduxer.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"github.com/dustin/go-humanize"
"github.com/gabriel-vasile/mimetype"
"github.com/go-shiori/go-readability"
"github.com/go-shiori/obelisk"
"github.com/iawia002/lux/downloader"
"github.com/iawia002/lux/extractors"
"github.com/wabarc/go-anonfile"
Expand All @@ -37,6 +38,10 @@ import (
)

var (
ctxBasenameKey struct{}

filePerm = os.FileMode(0o600)

_, existFFmpeg = exists("ffmpeg")
youget, existYouGet = exists("you-get")
ytdl, existYoutubeDL = exists("youtube-dl")
Expand Down Expand Up @@ -65,13 +70,13 @@ type bundle struct {

// Artifact represents the file paths stored on the local disk.
type Artifact struct {
Img, PDF, Raw, Txt, HAR, WARC, Media Asset
Img, PDF, Raw, Txt, HAR, HTM, WARC, Media Asset
}

// Asset represents the files on the local disk and the remote servers.
type Asset struct {
Local string
Remote Remote
Local string
}

// Remote represents the file on the remote server.
Expand Down Expand Up @@ -184,7 +189,7 @@ func Do(ctx context.Context, urls ...*url.URL) (Reduxer, error) {
fp = strings.TrimSuffix(fp, ".json") + mt.Extension()
}
logger.Debug("writing file: %s", fp)
if err := os.WriteFile(fp, buf, 0o600); err != nil {
if err := os.WriteFile(fp, buf, filePerm); err != nil {
return errors.Wrap(err, fmt.Sprintf("write %s file failed", ft))
}
if err := helper.SetField(key, "Local", fp); err != nil {
Expand All @@ -201,6 +206,10 @@ func Do(ctx context.Context, urls ...*url.URL) (Reduxer, error) {
var artifact Artifact
u, _ := url.Parse(shot.URL)

basename := strings.TrimRight(helper.FileName(shot.URL, ""), ".html")
basename = strings.TrimRight(basename, ".htm")
ctx = context.WithValue(ctx, ctxBasenameKey, basename)

if err := assign(&artifact.Img, shot.Image, shot.URL); err != nil {
logger.Error("assign field Img to path struct failed: %v", err)
}
Expand All @@ -220,13 +229,18 @@ func Do(ctx context.Context, urls ...*url.URL) (Reduxer, error) {
if err := helper.SetField(&artifact.Media, "Local", media(ctx, dir, shot.URL)); err != nil {
logger.Error("assign field Media to path struct failed: %v", err)
}
// Attach single file
singleFilePath := singleFile(ctx, bytes.NewReader(shot.HTML), dir, shot.URL)
if err := helper.SetField(&artifact.HTM, "Local", singleFilePath); err != nil {
logger.Error("assign field HTM to path struct failed: %v", err)
}
article, err := readability.FromReader(bytes.NewReader(shot.HTML), u)
if err != nil {
logger.Error("parse html failed: %v", err)
}
fn := strings.TrimRight(helper.FileName(shot.URL, ""), "html") + "txt"
fp := filepath.Join(dir, fn)
if err := os.WriteFile(fp, helper.String2Byte(article.TextContent), 0o600); err == nil && article.TextContent != "" {
txtName := basename + ".txt"
fp := filepath.Join(dir, txtName)
if err := os.WriteFile(fp, helper.String2Byte(article.TextContent), filePerm); err == nil && article.TextContent != "" {
if err := helper.SetField(&artifact.Txt, "Local", fp); err != nil {
logger.Error("assign field Txt to artifact struct failed: %v", err)
}
Expand Down Expand Up @@ -342,7 +356,7 @@ func exists(tool string) (string, bool) {
// nolint:gocyclo
func media(ctx context.Context, dir, in string) string {
logger.Debug("download media to %s, url: %s", dir, in)
fn := strings.TrimSuffix(helper.FileName(in, ""), ".html")
fn := basename(ctx)
fp := filepath.Join(dir, fn)

// Glob files by given pattern and return first file
Expand Down Expand Up @@ -508,6 +522,7 @@ func remotely(ctx context.Context, artifact *Artifact) error {
&artifact.Raw,
&artifact.Txt,
&artifact.HAR,
&artifact.HTM,
&artifact.WARC,
&artifact.Media,
}
Expand Down Expand Up @@ -553,6 +568,34 @@ func remotely(ctx context.Context, artifact *Artifact) error {
return nil
}

// singleFile archives the page at uri through obelisk, using inp as the page
// input, and stores the result as "<basename>.htm" inside dir. The basename
// is taken from ctx (see basename).
//
// It returns the path of the written file, or an empty string when either
// the archiving step or the file write fails.
func singleFile(ctx context.Context, inp io.Reader, dir, uri string) string {
	archiver := new(obelisk.Archiver)
	archiver.SkipResourceURLError = true
	archiver.RequestTimeout = 3 * time.Second
	archiver.Validate()

	data, _, err := archiver.Archive(ctx, obelisk.Request{URL: uri, Input: inp})
	if err != nil {
		return ""
	}

	// The file is written with filePerm, like the other artifacts in this package.
	dst := filepath.Join(dir, basename(ctx)+".htm")
	if werr := os.WriteFile(dst, data, filePerm); werr != nil {
		return ""
	}

	return dst
}

// basename returns the string stored in ctx under ctxBasenameKey. It yields
// an empty string when the key is absent or the stored value is not a string.
func basename(ctx context.Context) string {
	// A failed type assertion leaves name at its zero value, the empty string.
	name, _ := ctx.Value(ctxBasenameKey).(string)
	return name
}

func readOutput(rc io.ReadCloser) {
for {
out := make([]byte, 1024)
Expand Down
92 changes: 92 additions & 0 deletions reduxer/reduxer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,83 @@
package reduxer // import "github.com/wabarc/wayback/reduxer"

import (
"bufio"
"bytes"
"context"
"image"
"image/color"
"image/png"
"net/http"
"net/url"
"os"
"os/exec"
"path/filepath"
"strings"
"testing"

"github.com/wabarc/helper"
"github.com/wabarc/wayback/config"
)

// content is the sample HTML page that handleResponse serves at "/". It
// references "/image.png" through a plain <img> tag and contains no inline
// base64 image data, which TestSingleFile relies on as its precondition.
const content = `<html>
<head>
<title>Example Domain</title>
</head>
<body>
<div>
<h1>Example Domain</h1>
<p>This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.</p>
<p><a href="https://www.iana.org/domains/example">More information...</a></p>
<p><img src="/image.png"></p>
</div>
</body>
</html>
`

// genImage draws a width x height RGBA image and returns it encoded as PNG.
//
// The upper-left quadrant is filled with cyan, the lower-right quadrant with
// white, and every other pixel keeps the zero value. An empty buffer is
// returned if the image cannot be encoded.
func genImage(height, width int) bytes.Buffer {
	bounds := image.Rectangle{
		Min: image.Point{X: 0, Y: 0},
		Max: image.Point{X: width, Y: height},
	}
	img := image.NewRGBA(bounds)

	// Colors are defined by Red, Green, Blue, Alpha uint8 values.
	cyan := color.RGBA{R: 100, G: 200, B: 200, A: 0xff}

	// Set color for each pixel.
	for x := 0; x < width; x++ {
		for y := 0; y < height; y++ {
			switch {
			case x < width/2 && y < height/2: // upper left quadrant
				img.Set(x, y, cyan)
			case x >= width/2 && y >= height/2: // lower right quadrant
				img.Set(x, y, color.White)
			default:
				// Use zero value.
			}
		}
	}

	var b bytes.Buffer
	w := bufio.NewWriter(&b)
	if err := png.Encode(w, img); err != nil {
		return bytes.Buffer{}
	}
	// The buffered writer holds the encoded bytes until it is flushed; without
	// this call a small PNG never reaches b and the result would be empty.
	if err := w.Flush(); err != nil {
		return bytes.Buffer{}
	}

	return b
}

// handleResponse is the HTTP handler backing the mock server in these tests.
// It serves the sample page for "/" and a generated 36x36 PNG for
// "/image.png"; any other path gets no explicit header or body.
func handleResponse(w http.ResponseWriter, r *http.Request) {
	path := r.URL.Path

	if path == "/" {
		w.Header().Set("Content-Type", "text/html")
		_, _ = w.Write([]byte(content))
		return
	}

	if path == "/image.png" {
		img := genImage(36, 36)
		w.Header().Set("Content-Type", "image/png")
		_, _ = w.Write(img.Bytes())
	}
}

func TestDo(t *testing.T) {
binPath := helper.FindChromeExecPath()
if _, err := exec.LookPath(binPath); err != nil {
Expand Down Expand Up @@ -81,3 +147,29 @@ func TestCreateDir(t *testing.T) {
}
defer file.Close()
}

// TestSingleFile verifies that singleFile stores the sample page as one file
// in which the referenced image has been embedded as a base64 data URI.
func TestSingleFile(t *testing.T) {
	// t.TempDir removes the directory automatically when the test finishes.
	dir := t.TempDir()

	_, mux, server := helper.MockServer()
	mux.HandleFunc("/", handleResponse)
	defer server.Close()

	// Precondition: the fixture itself must not already contain an inlined image.
	exp := `<img src="data:image/png;base64,`
	if strings.Contains(content, exp) {
		t.Fatal(`unexpected sample html page`)
	}

	uri := server.URL
	filename := helper.RandString(5, "")
	ctx := context.WithValue(context.Background(), ctxBasenameKey, filename)
	got := singleFile(ctx, strings.NewReader(content), dir, uri)
	if got == "" {
		// singleFile reports every failure as an empty path.
		t.Fatal(`unexpected empty path, archiving or writing the single file failed`)
	}

	buf, err := os.ReadFile(got)
	if err != nil {
		t.Fatalf(`unexpected read single file %s: %v`, got, err)
	}
	if !strings.Contains(string(buf), exp) {
		t.Fatal(`unexpected archive webpage as a single file`)
	}
}
1 change: 1 addition & 0 deletions service/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ func filterArtifact(art reduxer.Artifact, upper int64) (paths []string) {
art.Raw,
art.Txt,
art.HAR,
art.HTM,
art.WARC,
art.Media,
}
Expand Down
6 changes: 4 additions & 2 deletions template/render/github.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,13 @@ func (gh *GitHub) parseArtifact(assets reduxer.Artifact, tmplBytes *bytes.Buffer
tmpl := `**[AnonFiles](https://anonfiles.com/)** - [ [IMG]({{ .Img.Remote.Anonfile | url -}}
) ¦ [PDF]({{ .PDF.Remote.Anonfile | url }}) ¦ [RAW]({{ .Raw.Remote.Anonfile | url -}}
) ¦ [TXT]({{ .Txt.Remote.Anonfile | url }}) ¦ [HAR]({{ .HAR.Remote.Anonfile | url -}}
) ¦ [WARC]({{ .WARC.Remote.Anonfile | url }}) ¦ [MEDIA]({{ .Media.Remote.Anonfile | url }}) ]
) ¦ [HTM]({{ .HTM.Remote.Anonfile | url }}) ¦ [WARC]({{ .WARC.Remote.Anonfile | url -}}
) ¦ [MEDIA]({{ .Media.Remote.Anonfile | url }}) ]
**[Catbox](https://catbox.moe/)** - [ [IMG]({{ .Img.Remote.Catbox | url -}}
) ¦ [PDF]({{ .PDF.Remote.Catbox | url }}) ¦ [RAW]({{ .Raw.Remote.Catbox | url -}}
) ¦ [TXT]({{ .Txt.Remote.Catbox | url }}) ¦ [HAR]({{ .HAR.Remote.Catbox | url -}}
) ¦ [WARC]({{ .WARC.Remote.Catbox | url }}) ¦ [MEDIA]({{ .Media.Remote.Catbox | url }}) ]`
) ¦ [HTM]({{ .HTM.Remote.Catbox | url }}) ¦ [WARC]({{ .WARC.Remote.Catbox | url -}}
) ¦ [MEDIA]({{ .Media.Remote.Catbox | url }}) ]`

tpl, err := template.New("assets").Funcs(funcMap()).Parse(tmpl)
if err != nil {
Expand Down
4 changes: 2 additions & 2 deletions template/render/github_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ More information...
> source: [https://example.com/](https://example.com/)
> archived: [http://telegra.ph/title-01-01](http://telegra.ph/title-01-01)
**[AnonFiles](https://anonfiles.com/)** - [ [IMG](https://anonfiles.com/FbZfSa9eu4) ¦ [PDF](https://anonfiles.com/r4G8Sb90ud) ¦ [RAW](https://anonfiles.com/pbG4Se94ua) ¦ [TXT](https://anonfiles.com/naG6S09bu1) ¦ [HAR](https://anonfiles.com/n1paZfB3ub) ¦ [WARC](https://anonfiles.com/v4G4S09auc) ¦ [MEDIA]() ]
**[Catbox](https://catbox.moe/)** - [ [IMG](https://files.catbox.moe/9u6yvu.png) ¦ [PDF](https://files.catbox.moe/q73uqh.pdf) ¦ [RAW](https://files.catbox.moe/bph1g6.htm) ¦ [TXT](https://files.catbox.moe/wwrby6.txt) ¦ [HAR](https://files.catbox.moe/3agtva.har) ¦ [WARC]() ¦ [MEDIA]() ]`
**[AnonFiles](https://anonfiles.com/)** - [ [IMG](https://anonfiles.com/FbZfSa9eu4) ¦ [PDF](https://anonfiles.com/r4G8Sb90ud) ¦ [RAW](https://anonfiles.com/pbG4Se94ua) ¦ [TXT](https://anonfiles.com/naG6S09bu1) ¦ [HAR](https://anonfiles.com/n1paZfB3ub) ¦ [HTM](https://anonfiles.com/v4G4S09abc) ¦ [WARC](https://anonfiles.com/v4G4S09auc) ¦ [MEDIA]() ]
**[Catbox](https://catbox.moe/)** - [ [IMG](https://files.catbox.moe/9u6yvu.png) ¦ [PDF](https://files.catbox.moe/q73uqh.pdf) ¦ [RAW](https://files.catbox.moe/bph1g6.htm) ¦ [TXT](https://files.catbox.moe/wwrby6.txt) ¦ [HAR](https://files.catbox.moe/3agtva.har) ¦ [HTM]() ¦ [WARC]() ¦ [MEDIA]() ]`

got := ForPublish(&GitHub{Cols: collects, Data: bundleExample}).String()
if got != expected {
Expand Down
6 changes: 4 additions & 2 deletions template/render/matrix.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,13 @@ func (m *Matrix) parseArtifact(assets reduxer.Artifact, tmplBytes *bytes.Buffer)
tmpl := `<b><a href="https://anonfiles.com/">AnonFiles</a></b> - [ <a href="{{ .Img.Remote.Anonfile | url -}}
">IMG</a> ¦ <a href="{{ .PDF.Remote.Anonfile | url }}">PDF</a> ¦ <a href="{{ .Raw.Remote.Anonfile | url -}}
">RAW</a> ¦ <a href="{{ .Txt.Remote.Anonfile | url }}">TXT</a> ¦ <a href="{{ .HAR.Remote.Anonfile | url -}}
">HAR</a> ¦ <a href="{{ .WARC.Remote.Anonfile | url }}">WARC</a> ¦ <a href="{{ .Media.Remote.Anonfile | url }}">MEDIA</a> ]<br>
">HAR</a> ¦ <a href="{{ .HTM.Remote.Anonfile | url }}">HTM</a> ¦ <a href="{{ .WARC.Remote.Anonfile | url -}}
">WARC</a> ¦ <a href="{{ .Media.Remote.Anonfile | url }}">MEDIA</a> ]<br>
<b><a href="https://catbox.moe/">Catbox</a></b> - [ <a href="{{ .Img.Remote.Catbox | url -}}
">IMG</a> ¦ <a href="{{ .PDF.Remote.Catbox | url }}">PDF</a> ¦ <a href="{{ .Raw.Remote.Catbox | url -}}
">RAW</a> ¦ <a href="{{ .Txt.Remote.Catbox | url }}">TXT</a> ¦ <a href="{{ .HAR.Remote.Catbox | url -}}
">HAR</a> ¦ <a href="{{ .WARC.Remote.Catbox | url }}">WARC</a> ¦ <a href="{{ .Media.Remote.Catbox | url }}">MEDIA</a> ]`
">HAR</a> ¦ <a href="{{ .HTM.Remote.Catbox | url }}">HTM</a> ¦ <a href="{{ .WARC.Remote.Catbox | url -}}
">WARC</a> ¦ <a href="{{ .Media.Remote.Catbox | url }}">MEDIA</a> ]`

tpl, err := template.New("assets").Funcs(funcMap()).Parse(tmpl)
if err != nil {
Expand Down
Loading

0 comments on commit 0067fb4

Please sign in to comment.