From 2dda2b12b4f70253da2fc380a383f61be798a024 Mon Sep 17 00:00:00 2001 From: Tim Bell Date: Wed, 11 Apr 2018 09:03:32 +1000 Subject: [PATCH 1/5] Speedup layout with .sort() and sortedcontainers.SortedListWithKey() --- pdfminer/layout.py | 29 ++++++++++++++--------------- setup.py | 2 +- tox.ini | 1 + 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 451d4e4d..b41498b7 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -1,9 +1,9 @@ +from sortedcontainers import SortedListWithKey from .utils import INF from .utils import Plane from .utils import get_bound from .utils import uniq -from .utils import csort from .utils import fsplit from .utils import bbox2str from .utils import matrix2str @@ -439,7 +439,7 @@ class LTTextBoxHorizontal(LTTextBox): def analyze(self, laparams): LTTextBox.analyze(self, laparams) - self._objs = csort(self._objs, key=lambda obj: -obj.y1) + self._objs.sort(key=lambda obj: -obj.y1) return def get_writing_mode(self): @@ -450,7 +450,7 @@ class LTTextBoxVertical(LTTextBox): def analyze(self, laparams): LTTextBox.analyze(self, laparams) - self._objs = csort(self._objs, key=lambda obj: -obj.x1) + self._objs.sort(key=lambda obj: -obj.y1) return def get_writing_mode(self): @@ -472,7 +472,7 @@ class LTTextGroupLRTB(LTTextGroup): def analyze(self, laparams): LTTextGroup.analyze(self, laparams) # reorder the objects from top-left to bottom-right. - self._objs = csort(self._objs, key=lambda obj: + self._objs.sort(key=lambda obj: (1-laparams.boxes_flow)*(obj.x0) - (1+laparams.boxes_flow)*(obj.y0+obj.y1)) return @@ -483,7 +483,7 @@ class LTTextGroupTBRL(LTTextGroup): def analyze(self, laparams): LTTextGroup.analyze(self, laparams) # reorder the objects from top-right to bottom-left. - self._objs = csort(self._objs, key=lambda obj: + self._objs.sort(key=lambda obj: -(1+laparams.boxes_flow)*(obj.x0+obj.x1) - (1-laparams.boxes_flow)*(obj.y1)) return @@ -637,21 +637,18 @@ def key_obj(t): (c,d,_,_) = t return (c,d) - # XXX this still takes O(n^2) :( - dists = [] + dists = SortedListWithKey(key=key_obj) for i in range(len(boxes)): obj1 = boxes[i] for j in range(i+1, len(boxes)): obj2 = boxes[j] - dists.append((0, dist(obj1, obj2), obj1, obj2)) - # We could use dists.sort(), but it would randomize the test result. - dists = csort(dists, key=key_obj) + dists.add((0, dist(obj1, obj2), obj1, obj2)) plane = Plane(self.bbox) plane.extend(boxes) while dists: (c, d, obj1, obj2) = dists.pop(0) if c == 0 and isany(obj1, obj2): - dists.append((1, d, obj1, obj2)) + dists.add((1, d, obj1, obj2)) continue if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))): @@ -660,11 +657,13 @@ def key_obj(t): group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) plane.remove(obj2) - dists = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists - if (obj1 in plane and obj2 in plane) ] + removed = {obj1, obj2} + to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists + if (obj1 in removed or obj2 in removed) ] + for r in to_remove: + dists.remove(r) for other in plane: - dists.append((0, dist(group, other), group, other)) - dists = csort(dists, key=key_obj) + dists.add((0, dist(group, other), group, other)) plane.add(group) assert len(plane) == 1, str(len(plane)) return list(plane) diff --git a/setup.py b/setup.py index 8eab391c..dd9db180 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,7 @@ import pdfminer as package -requires = ['six', 'pycryptodome'] +requires = ['six', 'pycryptodome', 'sortedcontainers'] if sys.version_info >= (3, 0): requires.append('chardet') diff --git a/tox.ini b/tox.ini index cba10979..d0e167be 100644 --- a/tox.ini +++ b/tox.ini @@ -8,3 +8,4 @@ deps = pycryptodome chardet nose + sortedcontainers From 8f8a78bb88ee2811aef112f8484d308293265edb Mon Sep 17 00:00:00 2001 From: Tim Bell Date: Wed, 11 Apr 2018 09:37:32 +1000 Subject: [PATCH 2/5] Remove now-unused csort() --- pdfminer/utils.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pdfminer/utils.py b/pdfminer/utils.py index 9cbcbb3b..f2258b04 100644 --- a/pdfminer/utils.py +++ b/pdfminer/utils.py @@ -145,13 +145,6 @@ def uniq(objs): return -# csort -def csort(objs, key): - """Order-preserving sorting function.""" - idxs = dict((obj, i) for (i, obj) in enumerate(objs)) - return sorted(objs, key=lambda obj: (key(obj), idxs[obj])) - - # fsplit def fsplit(pred, objs): """Split a list into two classes according to the predicate.""" From 0c8cf748feeaf5466cf3b9744a6815c1fa0059b0 Mon Sep 17 00:00:00 2001 From: Tim Bell Date: Wed, 11 Apr 2018 10:15:32 +1000 Subject: [PATCH 3/5] Fix copy-paste error --- pdfminer/layout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index b41498b7..97f9749c 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -450,7 +450,7 @@ class LTTextBoxVertical(LTTextBox): def analyze(self, laparams): LTTextBox.analyze(self, laparams) - self._objs.sort(key=lambda obj: -obj.y1) + self._objs.sort(key=lambda obj: -obj.x1) return def get_writing_mode(self): From f87bd1f17145459b60eaa615f6367e33e23b93f8 Mon Sep 17 00:00:00 2001 From: Tim Bell Date: Wed, 11 Apr 2018 10:18:52 +1000 Subject: [PATCH 4/5] Add sortedcontainers to TravisCI config --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index d4107e0d..56d54234 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,5 +9,6 @@ install: - pip install six - pip install pycryptodome - pip install chardet + - pip install sortedcontainers script: nosetests --nologcapture From 1cbeaebfce3262915724098bed8369b5a62d93ae Mon Sep 17 00:00:00 2001 From: Tim Bell Date: Wed, 11 Apr 2018 10:34:15 +1000 Subject: [PATCH 5/5] Fix Python 2.6 incompatibility --- pdfminer/layout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfminer/layout.py b/pdfminer/layout.py index 97f9749c..0d47580a 100644 --- a/pdfminer/layout.py +++ b/pdfminer/layout.py @@ -657,7 +657,7 @@ def key_obj(t): group = LTTextGroupLRTB([obj1, obj2]) plane.remove(obj1) plane.remove(obj2) - removed = {obj1, obj2} + removed = [obj1, obj2] to_remove = [ (c,d,obj1,obj2) for (c,d,obj1,obj2) in dists if (obj1 in removed or obj2 in removed) ] for r in to_remove: