Skip to content

Commit

Permalink
Implement all text extractor
Browse files Browse the repository at this point in the history
  • Loading branch information
ernsheong committed Nov 15, 2017
1 parent f502e09 commit d14f394
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 5 deletions.
16 changes: 11 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

func main() {
splitPtr := flag.String("split", "word", "a string")
extractorPtr := flag.String("extractor", "shallow", "a string")
flag.Parse()

file, err := os.Open("sample/" + os.Args[len(os.Args)-1])
Expand All @@ -21,12 +22,17 @@ func main() {
}

var ex boilertext.Extractor
if *splitPtr == "word" {
ex = extractor.NewShallowTextExtractor(bufio.ScanWords)
} else if *splitPtr == "rune" {
ex = extractor.NewShallowTextExtractor(bufio.ScanRunes)
if *extractorPtr == "shallow" {
if *splitPtr == "word" {
ex = extractor.NewShallowTextExtractor(bufio.ScanWords)
} else if *splitPtr == "rune" {
ex = extractor.NewShallowTextExtractor(bufio.ScanRunes)
} else {
log.Fatal("Missing split argument")
}
} else {
log.Fatal("Missing split argument")
// Returns all text
ex = &extractor.AllTextExtractor{}
}
res, err := ex.Process(file)
if err != nil {
Expand Down
27 changes: 27 additions & 0 deletions pkg/extractor/alltext.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package extractor

import (
"bufio"
"io"
"strings"

boilertext "github.com/PageDash/boilertext/pkg"
)

// AllTextExtractor is an extractor that returns all text in the document,
// without filtering out any text blocks. It has no fields, so a zero value
// (for example &AllTextExtractor{}) is ready to use.
type AllTextExtractor struct{}

// Process takes raw HTML as an input and returns all text within that HTML.
//
// The input is split into text blocks by boilertext.GenerateTextBlocks using
// word-based tokenization (bufio.ScanWords). Each block's content is trimmed
// of surrounding whitespace and followed by a single space, so a non-empty
// result always ends with a trailing space. If no blocks are produced the
// result is the empty string.
//
// Any error from block generation is returned unchanged together with an
// empty string.
func (a *AllTextExtractor) Process(reader io.Reader) (string, error) {
	blocks, err := boilertext.GenerateTextBlocks(reader, bufio.ScanWords)
	if err != nil {
		return "", err
	}

	// Accumulate into a Builder rather than using += on a string, which
	// would re-copy the whole accumulated text for every block.
	var contentText strings.Builder
	for _, block := range blocks {
		contentText.WriteString(strings.TrimSpace(block.Content))
		// The separator is written after every block, including the last.
		contentText.WriteByte(' ')
	}

	return contentText.String(), nil
}

0 comments on commit d14f394

Please sign in to comment.