iss #74: strip annotation size on fb2 metadata parsing
maizy committed Oct 8, 2023
1 parent 260eff3 commit 2b70fa1
Showing 4 changed files with 110 additions and 9 deletions.
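For orientation before the diff, here is a minimal standalone sketch of the limiting approach this commit applies to annotation text: a cheap byte-length check stops accumulation early, and a rune-count check trims the assembled string at the end. The function name limitChars, the sample input, and the extra charsLimit > -1 guard on the final cut are illustrative only, not part of the repository's API.

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// limitChars mirrors the approach taken in this commit: stop appending once the
// builder holds at least charsLimit bytes, then cut the final string if its rune
// count still exceeds the limit. charsLimit == -1 means "no limit". The guard on
// the final cut is added here for the sketch; all identifiers are hypothetical.
func limitChars(parts []string, charsLimit int) string {
	var sb strings.Builder
	for _, p := range parts {
		sb.WriteString(p)
		if charsLimit > -1 && sb.Len() >= charsLimit {
			break // byte-based early stop, like isCharsLimitReached below
		}
	}
	out := strings.TrimSpace(sb.String())
	if charsLimit > -1 && utf8.RuneCountInString(out) > charsLimit {
		out = out[:charsLimit] // final trim, like fb2MarkupToText below
	}
	return out
}

func main() {
	parts := []string{strings.Repeat("a", 5000), strings.Repeat("b", 5000)}
	fmt.Println(len(limitChars(parts, 8000))) // 8000
}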
37 changes: 28 additions & 9 deletions internal/fb2/parser.go
@@ -8,6 +8,7 @@ import (
"strconv"
"strings"
"time"
"unicode/utf8"

"github.com/antchfx/xmlquery"

@@ -16,6 +17,8 @@ import (
"dev.maizy.ru/ponylib/internal/u"
)

const annotationMaxChars = 8000

func ScanBookMetadata(source io.Reader) (*fb2_parser.Fb2Metadata, error) {
xmlParser, err := xmlquery.CreateStreamParser(source,
"/FictionBook/description/title-info|"+
@@ -152,7 +155,7 @@ func ScanBookMetadata(source io.Reader) (*fb2_parser.Fb2Metadata, error) {
}

if annotationNode := xmlquery.FindOne(titleInfoNode, "//annotation"); annotationNode != nil {
annotation = u.StrPtr(fb2MarkupToText(annotationNode))
annotation = u.StrPtr(fb2MarkupToText(annotationNode, annotationMaxChars))
}
}

@@ -228,8 +231,7 @@ func parseDate(parentNode *xmlquery.Node) (formatted *string, parsed *time.Time)
return
}

func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, depth int) []string {
var result []string
func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, charsLimit int, depth int) {
var iterateChildren = false
postfix := ""
ifTagMatched := func(regex string) bool {
@@ -238,6 +240,7 @@ func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, depth int) []string {
}
switch node.Type {
case xmlquery.ElementNode:
//println("xml> " + strings.Repeat(" ", depth) + node.Data)
switch node.Data {
// text blocks
case "p", "ul", "blockquote", "poem", "stanza", "epigraph":
@@ -296,26 +299,42 @@ func fb2ToTextInner(node *xmlquery.Node, sb *strings.Builder, depth int) []string {
case xmlquery.TextNode:
fallthrough
case xmlquery.CharDataNode:
//println("xml> " + strings.Repeat(" ", depth) + "text/cdata")
if text := node.InnerText(); !isOnlyWhitespaces(text) {
sb.WriteString(collapseWhitespaces(text))
}
}

if iterateChildren && depth < 100 {
if iterateChildren && depth < 100 && !isCharsLimitReached(sb, charsLimit) {
for child := node.FirstChild; child != nil; child = child.NextSibling {
fb2ToTextInner(child, sb, depth+1)
fb2ToTextInner(child, sb, charsLimit, depth+1)
if isCharsLimitReached(sb, charsLimit) {
break
}
}
}
sb.WriteString(postfix)
return result
}

func fb2MarkupToText(root *xmlquery.Node) string {
func isCharsLimitReached(sb *strings.Builder, charsLimit int) bool {
// assume all runes have a size of 1 byte (which is not always true)
// TODO: find an efficient way to count the size in runes
return charsLimit > -1 && sb.Len() >= charsLimit
}

func fb2MarkupToText(root *xmlquery.Node, charsLimit int) string {
var sb strings.Builder
for child := root.FirstChild; child != nil; child = child.NextSibling {
fb2ToTextInner(child, &sb, 0)
fb2ToTextInner(child, &sb, charsLimit, 0)
if isCharsLimitReached(&sb, charsLimit) {
break
}
}
finalTextAsStr := strings.TrimSpace(normalizeXmlText(sb.String()))
if utf8.RuneCountInString(finalTextAsStr) > charsLimit {
finalTextAsStr = finalTextAsStr[:charsLimit]
}
return strings.TrimSpace(normalizeXmlText(sb.String()))
return finalTextAsStr
}

func findText(node *xmlquery.Node, query string) *string {
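The comment inside isCharsLimitReached above notes that strings.Builder.Len() counts bytes, not runes, so the early-stop check is only an approximation for multi-byte text. A tiny standalone illustration of the difference (not from the repository):

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

func main() {
	var sb strings.Builder
	sb.WriteString("привет") // 6 Cyrillic runes, 2 bytes each in UTF-8

	fmt.Println(sb.Len())                            // 12 — what the limit check compares
	fmt.Println(utf8.RuneCountInString(sb.String())) // 6 — what the final trim compares
}

Because byte length is always at least the rune count, the byte-based check triggers no later than a true rune-based limit would, so for non-ASCII annotations it simply stops a bit earlier.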
54 changes: 54 additions & 0 deletions internal/fb2/parser_test.go
@@ -10,6 +10,7 @@ import (
"testing"
"time"

"github.com/antchfx/xmlquery"
"github.com/stretchr/testify/assert"

"dev.maizy.ru/ponylib/fb2_parser"
@@ -24,6 +25,20 @@ func openTestBook(name string) io.Reader {
return file
}

func loadTestXmlStream(name string, rootNone string) *xmlquery.Node {
xmlParser, err := xmlquery.CreateStreamParser(openTestBook(name),
"/"+rootNone)
if err != nil {
panic(err)
}

node, err := xmlParser.Read()
if err != nil {
panic(err)
}
return node
}

func longText(text string) *string {
noTabs := regexp.MustCompile("(?m)^\t+").ReplaceAllString(text, "")
noNL := regexp.MustCompile("(?m)\n").ReplaceAllString(noTabs, " ")
@@ -152,3 +167,42 @@ func Test_normalizeTextPtr(t *testing.T) {
a := assert.New(t)
a.Nil(normalizeXmlTextPtr(nil))
}

func Test_fb2MarkupToText_longTextFirstLevel(t *testing.T) {
node := loadTestXmlStream("long-text-first-level.xml", "annotation")
got := fb2MarkupToText(node, 8000)
a := assert.New(t)
a.Len(got, 8000)

newLineOccurence := strings.Count(got, "\n")
a.EqualValues(2, newLineOccurence)

lines := strings.Split(got, "\n")
a.Regexp("^3K text 1", lines[0])
a.Len(lines[0], 3000)
a.Regexp("^3K text 2", lines[1])
a.Len(lines[1], 3000)
a.Regexp("^3K text 3", lines[2])
a.Len(lines[2], 8000-3000-3000-2)
}

func Test_fb2MarkupToText_longTextDeepLevel(t *testing.T) {
node := loadTestXmlStream("long-text-deep-level.xml", "annotation")
got := fb2MarkupToText(node, 8000)
println(got)

a := assert.New(t)
a.Len(got, 8000)

newLineOccurence := strings.Count(got, "\n")
a.EqualValues(3, newLineOccurence)

lines := strings.Split(got, "\n")
a.EqualValues("short text", lines[0])
a.Regexp(`^\* 3K text 1`, lines[1])
a.Len(lines[1], 3002)
a.Regexp(`^\* 3K text 2`, lines[2])
a.Len(lines[2], 3002)
a.Regexp(`^\* 3K text 3`, lines[3])
a.Len(lines[3], 8000-3002-3002-3-len("short text"))
}
