Skip to content

Commit

Permalink
ocr: normalize capitalization
Browse files Browse the repository at this point in the history
  • Loading branch information
martinlindhe committed Oct 14, 2017
1 parent 578c6a2 commit 198a1cb
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 0 deletions.
36 changes: 36 additions & 0 deletions filter_ocr.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package subtitles

import (
"strings"
"unicode"

log "github.com/Sirupsen/logrus"
)
Expand Down Expand Up @@ -32,10 +33,45 @@ func (subtitle *Subtitle) filterOCR() *Subtitle {
cap.Text[i] = strings.Replace(cap.Text[i], strings.Title(bad), strings.Title(good), -1)
}

cap.Text[i] = fixOCRLineCapitalization(cap.Text[i])
if org != cap.Text[i] {
log.Println("[ocr]", org, "->", cap.Text[i])
}
}
}
return subtitle
}

// fixOCRLineCapitalization splits s on single spaces, normalizes the
// capitalization of each resulting word with fixOCRWordCapitalization,
// and joins the words back together with single spaces.
func fixOCRLineCapitalization(s string) string {
	parts := strings.Split(s, " ")
	fixed := make([]string, 0, len(parts))
	for _, word := range parts {
		fixed = append(fixed, fixOCRWordCapitalization(word))
	}
	return strings.Join(fixed, " ")
}

// fixOCRWordCapitalization repairs mixed-case words produced by OCR,
// e.g. "GAsPs" => "GASPS".
//
// Words of 3 bytes or fewer are returned unchanged. For longer words:
//   - two or more uppercase letters: the whole word is uppercased;
//   - only the leading letter is uppercase: the word is returned as is;
//   - otherwise: the whole word is lowercased.
func fixOCRWordCapitalization(s string) string {
	// NOTE: len counts bytes, not runes, so words containing multi-byte
	// characters pass this threshold with fewer than 4 letters.
	if len(s) <= 3 {
		return s
	}

	upper := 0
	ucStart := false
	for i, char := range s {
		if !unicode.IsUpper(char) {
			continue
		}
		upper++
		// Ranging over a string yields byte offsets; offset 0 is the first rune.
		if i == 0 {
			ucStart = true
		}
	}

	switch {
	case upper >= 2:
		return strings.ToUpper(s)
	case ucStart:
		// The only uppercase letter is the first one, so the word is already
		// capitalized correctly. strings.Title must not be used here: it
		// treats punctuation as a word boundary and would turn "It's" into
		// "It'S" and "Well-known" into "Well-Known".
		return s
	default:
		return strings.ToLower(s)
	}
}
19 changes: 19 additions & 0 deletions filter_ocr_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,22 @@ func TestFilterOCREnglish(t *testing.T) {

assert.Equal(t, &expected, in.filterOCR())
}

// TestFilterOCRCapitalization checks that filterOCR normalizes a word with
// OCR-mangled capitalization ("GAsPs" becomes "GASPS") while leaving the
// already-lowercase word next to it unchanged.
func TestFilterOCRCapitalization(t *testing.T) {
	want := Subtitle{[]Caption{{
		1,
		makeTime(0, 0, 4, 630),
		makeTime(0, 0, 6, 18),
		[]string{"GASPS slowly"},
	}}}

	input := Subtitle{[]Caption{{
		1,
		makeTime(0, 0, 4, 630),
		makeTime(0, 0, 6, 18),
		[]string{"GAsPs slowly"},
	}}}

	got := input.filterOCR()
	assert.Equal(t, &want, got)
}

0 comments on commit 198a1cb

Please sign in to comment.