Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make PageText.sortPosition() sort order deterministic. #153

Merged
merged 9 commits into from
Aug 29, 2019
36 changes: 34 additions & 2 deletions extractor/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -1180,14 +1180,46 @@ var (
// sortPosition sorts a text list by its elements' positions on a page.
// Sorting is by orientation then top to bottom, left to right when page is orientated so that text
// is horizontal.
// Text is considered to be on different lines if the lines' orientedStart.Y differs by more than `tol`.
func (pt *PageText) sortPosition(tol float64) {
if len(pt.marks) == 0 {
return
}

// For grouping data vertically into lines, it is necessary to have the data presorted by
// descending y position.
sort.SliceStable(pt.marks, func(i, j int) bool {
gunnsth marked this conversation as resolved.
Show resolved Hide resolved
ti, tj := pt.marks[i], pt.marks[j]
if ti.orient != tj.orient {
return ti.orient < tj.orient
}
return ti.orientedStart.Y >= tj.orientedStart.Y
})

// Cluster the marks into y-clusters by relative y proximity. Each cluster is our guess of what
// makes up a line of text.
clusters := make([]int, len(pt.marks))
cluster := 0
clusters[0] = cluster
for i := 1; i < len(pt.marks); i++ {
if pt.marks[i-1].orient != pt.marks[i].orient {
cluster++
} else {
if pt.marks[i-1].orientedStart.Y - pt.marks[i].orientedStart.Y > tol {
cluster++
}
}
clusters[i] = cluster
}

// Sort by y-cluster and x.
sort.SliceStable(pt.marks, func(i, j int) bool {
ti, tj := pt.marks[i], pt.marks[j]
if ti.orient != tj.orient {
return ti.orient < tj.orient
}
if math.Abs(ti.orientedStart.Y-tj.orientedStart.Y) > tol {
return ti.orientedStart.Y > tj.orientedStart.Y
if clusters[i] != clusters[j] {
return clusters[i] < clusters[j]
}
return ti.orientedStart.X < tj.orientedStart.X
})
Expand Down
48 changes: 48 additions & 0 deletions extractor/text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (

"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/creator"
"github.com/unidoc/unipdf/v3/internal/transform"
"github.com/unidoc/unipdf/v3/model"
"golang.org/x/text/unicode/norm"
)
Expand Down Expand Up @@ -180,6 +181,53 @@ func TestTermMarksFiles(t *testing.T) {
testTermMarksFiles(t)
}

// TestTextSort checks that PageText.sortPosition() gives expected results.
// It builds a list of marks in the expected final order, scrambles a copy of it, sorts the copy
// with sortPosition and verifies that the original order is recovered.
func TestTextSort(t *testing.T) {
	// tol is the y tolerance passed to sortPosition. The fixture below is laid out around it.
	const tol = 15

	// marks0 is in the expected sort order for tol=15.
	marks0 := []textMark{
		// Consecutive y differences (20) > tol => each mark is on its own line, so the marks
		// sort by Y descending.
		{orientedStart: transform.Point{X: 300, Y: 160}, text: "00"},
		{orientedStart: transform.Point{X: 200, Y: 140}, text: "01"},
		{orientedStart: transform.Point{X: 100, Y: 120}, text: "02"},

		// Consecutive y differences (10) < tol => same line, so the marks sort by X ascending.
		// Note that the first and last marks differ by 20 > tol yet are still expected to end
		// up on the same line.
		{orientedStart: transform.Point{X: 100, Y: 30}, text: "10"},
		{orientedStart: transform.Point{X: 200, Y: 40}, text: "11"},
		{orientedStart: transform.Point{X: 300, Y: 50}, text: "12"},

		// Consecutive y differences (1) < tol => same line, so the marks sort by X ascending.
		// This line is more than tol below the previous one.
		{orientedStart: transform.Point{X: 100, Y: 3}, text: "20"},
		{orientedStart: transform.Point{X: 200, Y: 4}, text: "21"},
		{orientedStart: transform.Point{X: 300, Y: 5}, text: "22"},
	}

	// marks is a copy of marks0 with its order scrambled. Every (X, Y) pair in marks0 is unique,
	// so this scrambling sort has no ties and always produces the same starting order.
	marks := make([]textMark, len(marks0))
	copy(marks, marks0)
	sort.Slice(marks, func(i, j int) bool {
		ti, tj := marks[i], marks[j]
		if ti.orientedStart.X != tj.orientedStart.X {
			return ti.orientedStart.X > tj.orientedStart.X
		}
		if ti.orient != tj.orient {
			return ti.orient > tj.orient
		}
		return ti.orientedStart.Y < tj.orientedStart.Y
	})

	// Copy marks to PageText and sort them. This should give the same order as marks0.
	pt := PageText{marks: marks}
	pt.sortPosition(tol)

	// Guard the indexing below: a length mismatch should be a test failure, not a panic.
	if len(pt.marks) != len(marks0) {
		t.Fatalf("len(pt.marks)=%d != len(marks0)=%d", len(pt.marks), len(marks0))
	}

	// Check that marks order is the same as marks0. The text is compared as well as the position
	// because it is what identifies each mark in the fixture.
	for i, m0 := range marks0 {
		m := pt.marks[i]
		if m.text != m0.text ||
			m.orientedStart.X != m0.orientedStart.X ||
			m.orientedStart.Y != m0.orientedStart.Y {
			t.Fatalf("i=%d m=%v != m0=%v", i, m, m0)
		}
	}
}

// fileExtractionTests are PDF file names and terms we expect to find on specified pages of those
// PDF files.
// `pageTerms`[pageNum] are the terms we expect to find on (1-offset) page number pageNum of
Expand Down