iss #74: strip annotation size on fb2 metadata parsing
maizy committed Oct 8, 2023
1 parent 260eff3 commit 2b70fa1
Showing 4 changed files with 110 additions and 9 deletions.
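For orientation before the diff, here is a minimal standalone sketch of the limiting approach this commit applies to annotation text: a cheap byte-length check stops accumulation early, and a rune-count check trims the assembled string at the end. The function name limitChars, the sample input, and the extra charsLimit > -1 guard on the final cut are illustrative only, not part of the repository's API.

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// limitChars mirrors the approach taken in this commit: stop appending once the
// builder holds at least charsLimit bytes, then cut the final string if its rune
// count still exceeds the limit. charsLimit == -1 means "no limit". The guard on
// the final cut is added here for the sketch; all identifiers are hypothetical.
func limitChars(parts []string, charsLimit int) string {
	var sb strings.Builder
	for _, p := range parts {
		sb.WriteString(p)
		if charsLimit > -1 && sb.Len() >= charsLimit {
			break // byte-based early stop, like isCharsLimitReached below
		}
	}
	out := strings.TrimSpace(sb.String())
	if charsLimit > -1 && utf8.RuneCountInString(out) > charsLimit {
		out = out[:charsLimit] // final trim, like fb2MarkupToText below
	}
	return out
}

func main() {
	parts := []string{strings.Repeat("a", 5000), strings.Repeat("b", 5000)}
	fmt.Println(len(limitChars(parts, 8000))) // 8000
}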
37 changes: 28 additions & 9 deletions internal/fb2/parser.go
@@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"time"
"unicode/utf8"

"github.com/antchfx/xmlquery"

@@ -16,6 +17,8 @@ import (
"dev.maizy.ru/ponylib/internal/u"
)

const annotationMaxChars = 8000

func ScanBookMetadata(source io.Reader) (*fb2_parser.Fb2Metadata, error) {
xmlParser, err := xmlquery.CreateStreamParser(source,
"/FictionBook/description/title-info|"+
@@ -152,7 +155,7 @@ func ScanBookMetadata(source io.Reader) (*fb2_parser.Fb2Metadata, error) {
}

if annotationNode := xmlquery.FindOne(titleInfoNode, "//annotation"); annotationNode != nil {
annotation = u.StrPtr(fb2MarkupToText(annotationNode))
annotation = u.StrPtr(fb2MarkupToText(annotationNode, annotationMaxChars))
}
}

@@ -228,8 +231,7 @@ func parseDate(parentNode *xmlquery.Node) (formatted *string, parsed *time.Time)
return
}

func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, depth int) []string {
var result []string
func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, charsLimit int, depth int) {
var iterateChildren = false
postfix := ""
ifTagMatched := func(regex string) bool {
@@ -238,6 +240,7 @@ func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, depth int) []string {
}
switch node.Type {
case xmlquery.ElementNode:
//println("xml> " + strings.Repeat(" ", depth) + node.Data)
switch node.Data {
// text blocks
case "p", "ul", "blockquote", "poem", "stanza", "epigraph":
@@ -296,26 +299,42 @@ func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, depth int) []string {
case xmlquery.TextNode:
fallthrough
case xmlquery.CharDataNode:
//println("xml> " + strings.Repeat(" ", depth) + "text/cdata")
if text := node.InnerText(); !isOnlyWhitespaces(text) {
sb.WriteString(collapseWhitespaces(text))
}
}

if iterateChildren && depth < 100 {
if iterateChildren && depth < 100 && !isCharsLimitReached(sb, charsLimit) {
for child := node.FirstChild; child != nil; child = child.NextSibling {
fb2ToTextInner(child, sb, depth+1)
fb2ToTextInner(child, sb, charsLimit, depth+1)
if isCharsLimitReached(sb, charsLimit) {
break
}
}
}
sb.WriteString(postfix)
return result
}

func fb2MarkupToText(root *xmlquery.Node) string {
func isCharsLimitReached(sb *strings.Builder, charsLimit int) bool {
// assume all runes have a size of 1 byte (which is not always true)
// TODO: find an efficient way to count the size in runes
return charsLimit > -1 && sb.Len() >= charsLimit
}

func fb2MarkupToText(root *xmlquery.Node, charsLimit int) string {
var sb strings.Builder
for child := root.FirstChild; child != nil; child = child.NextSibling {
fb2ToTextInner(child, &sb, 0)
fb2ToTextInner(child, &sb, charsLimit, 0)
if isCharsLimitReached(&sb, charsLimit) {
break
}
}
finalTextAsStr := strings.TrimSpace(normalizeXmlText(sb.String()))
if utf8.RuneCountInString(finalTextAsStr) > charsLimit {
finalTextAsStr = finalTextAsStr[:charsLimit]
}
return strings.TrimSpace(normalizeXmlText(sb.String()))
return finalTextAsStr
}

func findText(node *xmlquery.Node, query string) *string {
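The comment inside isCharsLimitReached above notes that strings.Builder.Len() counts bytes, not runes, so the early-stop check is only an approximation for multi-byte text. A tiny standalone illustration of the difference (not from the repository):

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

func main() {
	var sb strings.Builder
	sb.WriteString("привет") // 6 Cyrillic runes, 2 bytes each in UTF-8

	fmt.Println(sb.Len())                            // 12 — what the limit check compares
	fmt.Println(utf8.RuneCountInString(sb.String())) // 6 — what the final trim compares
}

Because byte length is always at least the rune count, the byte-based check triggers no later than a true rune-based limit would, so for non-ASCII annotations it simply stops a bit earlier.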
54 changes: 54 additions & 0 deletions internal/fb2/parser_test.go
@@ -10,6 +10,7 @@ import (
"testing"
"time"

"github.com/antchfx/xmlquery"
"github.com/stretchr/testify/assert"

"dev.maizy.ru/ponylib/fb2_parser"
@@ -24,6 +25,20 @@ func openTestBook(name string) io.Reader {
return file
}

func loadTestXmlStream(name string, rootNone string) *xmlquery.Node {
xmlParser, err := xmlquery.CreateStreamParser(openTestBook(name),
"/"+rootNone)
if err != nil {
panic(err)
}

node, err := xmlParser.Read()
if err != nil {
panic(err)
}
return node
}

func longText(text string) *string {
noTabs := regexp.MustCompile("(?m)^\t+").ReplaceAllString(text, "")
noNL := regexp.MustCompile("(?m)\n").ReplaceAllString(noTabs, " ")
@@ -152,3 +167,42 @@ func Test_normalizeTextPtr(t *testing.T) {
a := assert.New(t)
a.Nil(normalizeXmlTextPtr(nil))
}

func Test_fb2MarkupToText_longTextFirstLevel(t *testing.T) {
node := loadTestXmlStream("long-text-first-level.xml", "annotation")
got := fb2MarkupToText(node, 8000)
a := assert.New(t)
a.Len(got, 8000)

newLineOccurence := strings.Count(got, "\n")
a.EqualValues(2, newLineOccurence)

lines := strings.Split(got, "\n")
a.Regexp("^3K text 1", lines[0])
a.Len(lines[0], 3000)
a.Regexp("^3K text 2", lines[1])
a.Len(lines[1], 3000)
a.Regexp("^3K text 3", lines[2])
a.Len(lines[2], 8000-3000-3000-2)
}

func Test_fb2MarkupToText_longTextDeepLevel(t *testing.T) {
node := loadTestXmlStream("long-text-deep-level.xml", "annotation")
got := fb2MarkupToText(node, 8000)
println(got)

a := assert.New(t)
a.Len(got, 8000)

newLineOccurence := strings.Count(got, "\n")
a.EqualValues(3, newLineOccurence)

lines := strings.Split(got, "\n")
a.EqualValues("short text", lines[0])
a.Regexp(`^\* 3K text 1`, lines[1])
a.Len(lines[1], 3002)
a.Regexp(`^\* 3K text 2`, lines[2])
a.Len(lines[2], 3002)
a.Regexp(`^\* 3K text 3`, lines[3])
a.Len(lines[3], 8000-3002-3002-3-len("short text"))
}
