Skip to content

Commit

Permalink
Faster conversion and cleaner code
Browse files (browse the repository at this point in the history)
  • Loading branch information
pgaskin committed Dec 16, 2017
1 parent 950cd0e commit 0c08274
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 39 deletions.
72 changes: 42 additions & 30 deletions kepub/content.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -169,10 +169,12 @@ func addSpans(doc *goquery.Document) error {
return nil
}

// openSelfClosingPs opens self-closing p tags.
func openSelfClosingPs(html *string) error {
re := regexp.MustCompile(`<p[^>/]*/>`)
*html = re.ReplaceAllString(*html, `<p></p>`)
// addKoboStyles appends a <style> element that zeroes the top and bottom
// margins of div#book-inner to the first <head> element found in doc.
//
// It returns an error when the selection returned by the append does not
// contain exactly one element, and nil otherwise.
func addKoboStyles(doc *goquery.Document) error {
	const koboStyle = `<style type="text/css">div#book-inner{margin-top: 0;margin-bottom: 0;}</style>`

	head := doc.Find("head").First()
	if appended := head.AppendHtml(koboStyle); appended.Length() != 1 {
		return fmt.Errorf("could not append kobo styles")
	}

	return nil
}

Expand All @@ -191,28 +193,34 @@ func smartenPunctuation(html *string) error {
}

// cleanHTML cleans up html for a kobo epub.
func cleanHTML(html *string) error {
emptyHeadingRe := regexp.MustCompile(`<h\d+>\s*</h\d+>`)
*html = emptyHeadingRe.ReplaceAllString(*html, "")

msPRe := regexp.MustCompile(`\s*<o:p>\s*<\/o:p>`)
*html = msPRe.ReplaceAllString(*html, " ")

msStRe := regexp.MustCompile(`<\/?st1:\w+>`)
*html = msStRe.ReplaceAllString(*html, "")

// unicode replacement chars
*html = strings.Replace(*html, "�", "", -1)
func cleanHTML(doc *goquery.Document) error {
// Remove Adobe DRM tags
doc.Find(`meta[name="Adept.expected.resource"]`).Remove()

// Remove empty MS <o:p> tags
doc.Find(`o\:p`).FilterFunction(func(_ int, s *goquery.Selection) bool {
return strings.Trim(s.Text(), "\t \n") == ""
}).Remove()

// Remove empty headings
doc.Find(`h1,h2,h3,h4,h5,h6`).FilterFunction(func(_ int, s *goquery.Selection) bool {
return strings.Trim(s.Text(), "\t \n") == ""
}).Remove()

// Remove MS <st1:whatever> tags
doc.Find(`*`).FilterFunction(func(_ int, s *goquery.Selection) bool {
return strings.HasPrefix(goquery.NodeName(s), "st1:")
}).Remove()

// Open self closing p tags
doc.Find(`p`).Each(func(_ int, s *goquery.Selection) {
if s.Children().Length() == 0 && strings.Trim(s.Text(), "\n \t") == "" {
s.SetHtml("")
}
})

// Add type to style tags
*html = strings.Replace(*html, `<style>`, `<style type="text/css">`, -1)

// ADEPT drm tags
adeptRe := regexp.MustCompile(`(<meta\s+content=".+"\s+name="Adept.expected.resource"\s+\/>)`)
*html = adeptRe.ReplaceAllString(*html, "")

// Fix commented xml tag
*html = strings.Replace(*html, `<!-- ?xml version="1.0" encoding="utf-8"? -->`, `<?xml version="1.0" encoding="utf-8"?>`, 1)
doc.Find(`style`).SetAttr("type", "text/css")

return nil
}
Expand All @@ -232,25 +240,29 @@ func process(content *string) error {
return err
}

h, err := doc.Html()
if err != nil {
if err := addKoboStyles(doc); err != nil {
return err
}

if err := openSelfClosingPs(&h); err != nil {
if err := cleanHTML(doc); err != nil {
return err
}

if err := cleanHTML(&h); err != nil {
h, err := doc.Html()
if err != nil {
return err
}

if err := smartenPunctuation(&h); err != nil {
return err
}

// Kobo style fixes
h = strings.Replace(h, "</head>", "<style type=\"text/css\">div#book-inner{margin-top: 0;margin-bottom: 0;}</style></head>", 1)
// Remove unicode replacement chars
h = strings.Replace(h, "�", "", -1)

// Fix commented xml tag
h = strings.Replace(h, `<!-- ?xml version="1.0" encoding="utf-8"? -->`, `<?xml version="1.0" encoding="utf-8"?>`, 1)
h = strings.Replace(h, `<!--?xml version="1.0" encoding="utf-8"?-->`, `<?xml version="1.0" encoding="utf-8"?>`, 1)

*content = h

Expand Down
20 changes: 11 additions & 9 deletions kepub/content_test.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,9 +13,17 @@ import (
)

func TestCleanHTML(t *testing.T) {
h := `<meta content="urn:uuid:asd--asdasd-asdasdas-dasdasd234234" name="Adept.expected.resource" />��<st1:asd></st1:asd><o:p></o:p><h1></h1><h3></h3><h2>test</h2><style></style>`
cleanHTML(&h)
assert.Equal(t, " <h2>test</h2><style type=\"text/css\"></style>", h, "should be equal if cleaned correctly")
h := `<html><head></head><body><p /><p>test</p><p /><p /><p>test</p><meta content="urn:uuid:asd--asdasd-asdasdas-dasdasd234234" name="Adept.expected.resource" /><st1:asd></st1:asd><o:p></o:p><h1></h1><h3></h3><h2>test</h2><p>test</p><style></style></body></html>`

doc, err := goquery.NewDocumentFromReader(strings.NewReader(h))
assert.Nil(t, err, "err should be nil")

cleanHTML(doc)

nh, err := doc.Html()
assert.Nil(t, err, "err should be nil")

assert.Equal(t, `<html><head></head><body><p></p><p>test</p><p></p><p></p><p>test</p><h2>test</h2><p>test</p><style type="text/css"></style></body></html>`, nh, "should be equal if cleaned correctly")
}

func TestSmartenPunctuation(t *testing.T) {
Expand All @@ -24,12 +32,6 @@ func TestSmartenPunctuation(t *testing.T) {
assert.Equal(t, " &#x2014; &#x2013; <!-- test -->", h, "should be equal if smartened correctly")
}

func TestOpenSelfClosingPs(t *testing.T) {
h := `<p>test</p><p /><p /><p>test</p>`
openSelfClosingPs(&h)
assert.Equal(t, "<p>test</p><p></p><p></p><p>test</p>", h, "should be equal if reopened correctly")
}

func TestAddSpans(t *testing.T) {
h := `<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
Expand Down

0 comments on commit 0c08274

Please sign in to comment.