From 0c082742cd654a621c68eeb30171a0b938fb7eec Mon Sep 17 00:00:00 2001 From: Patrick G Date: Sat, 16 Dec 2017 16:49:56 -0500 Subject: [PATCH] Faster conversion and cleaner code --- kepub/content.go | 72 +++++++++++++++++++++++++------------------ kepub/content_test.go | 20 ++++++------ 2 files changed, 53 insertions(+), 39 deletions(-) diff --git a/kepub/content.go b/kepub/content.go index 4f1f63d..071f5e2 100644 --- a/kepub/content.go +++ b/kepub/content.go @@ -169,10 +169,12 @@ func addSpans(doc *goquery.Document) error { return nil } -// openSelfClosingPs opens self-closing p tags. -func openSelfClosingPs(html *string) error { - re := regexp.MustCompile(`/]*/>`) - *html = re.ReplaceAllString(*html, `

`) +// addKoboStyles adds kobo styles. +func addKoboStyles(doc *goquery.Document) error { + s := doc.Find("head").First().AppendHtml(``) + if s.Length() != 1 { + return fmt.Errorf("could not append kobo styles") + } return nil } @@ -191,28 +193,34 @@ func smartenPunctuation(html *string) error { } // cleanHTML cleans up html for a kobo epub. -func cleanHTML(html *string) error { - emptyHeadingRe := regexp.MustCompile(`\s*`) - *html = emptyHeadingRe.ReplaceAllString(*html, "") - - msPRe := regexp.MustCompile(`\s*\s*<\/o:p>`) - *html = msPRe.ReplaceAllString(*html, " ") - - msStRe := regexp.MustCompile(`<\/?st1:\w+>`) - *html = msStRe.ReplaceAllString(*html, "") - - // unicode replacement chars - *html = strings.Replace(*html, "�", "", -1) +func cleanHTML(doc *goquery.Document) error { + // Remove Adobe DRM tags + doc.Find(`meta[name="Adept.expected.resource"]`).Remove() + + // Remove empty MS tags + doc.Find(`o\:p`).FilterFunction(func(_ int, s *goquery.Selection) bool { + return strings.Trim(s.Text(), "\t \n") == "" + }).Remove() + + // Remove empty headings + doc.Find(`h1,h2,h3,h4,h5,h6`).FilterFunction(func(_ int, s *goquery.Selection) bool { + return strings.Trim(s.Text(), "\t \n") == "" + }).Remove() + + // Remove MS tags + doc.Find(`*`).FilterFunction(func(_ int, s *goquery.Selection) bool { + return strings.HasPrefix(goquery.NodeName(s), "st1:") + }).Remove() + + // Open self closing p tags + doc.Find(`p`).Each(func(_ int, s *goquery.Selection) { + if s.Children().Length() == 0 && strings.Trim(s.Text(), "\n \t") == "" { + s.SetHtml("") + } + }) // Add type to style tags - *html = strings.Replace(*html, `", 1) + // Remove unicode replacement chars + h = strings.Replace(h, "�", "", -1) + + // Fix commented xml tag + h = strings.Replace(h, ``, ``, 1) + h = strings.Replace(h, ``, ``, 1) *content = h diff --git a/kepub/content_test.go b/kepub/content_test.go index 6f59668..125be54 100644 --- a/kepub/content_test.go +++ b/kepub/content_test.go @@ -13,9 +13,17 @@ import ( ) func TestCleanHTML(t *testing.T) { - h := `��

test

` - cleanHTML(&h) - assert.Equal(t, "

test

", h, "should be equal if cleaned correctly") + h := `

test

test

test

test

` + + doc, err := goquery.NewDocumentFromReader(strings.NewReader(h)) + assert.Nil(t, err, "err should be nil") + + cleanHTML(doc) + + nh, err := doc.Html() + assert.Nil(t, err, "err should be nil") + + assert.Equal(t, `

test

test

test

test

`, nh, "should be equal if cleaned correctly") } func TestSmartenPunctuation(t *testing.T) { @@ -24,12 +32,6 @@ func TestSmartenPunctuation(t *testing.T) { assert.Equal(t, " — – ", h, "should be equal if smartened correctly") } -func TestOpenSelfClosingPs(t *testing.T) { - h := `

test

test

` - openSelfClosingPs(&h) - assert.Equal(t, "

test

test

", h, "should be equal if reopened correctly") -} - func TestAddSpans(t *testing.T) { h := `