Skip to content

Commit

Permalink
Speed up `line_offset` property (#1392)
Browse files Browse the repository at this point in the history
* Replace the dynamically built regex with a string `find` operation
* Add a cache of where each line starts, so that identifying line numbers is no longer quadratic when importing large chunks of HTML
  • Loading branch information
eanorige committed Oct 26, 2023
1 parent 4f0b91a commit 524e4da
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 9 deletions.
6 changes: 6 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). See the [Contributing Guide](contributing.md) for details.

## [unreleased]

### Fixed

* Fix a performance problem with HTML extraction where large HTML input could trigger quadratic line-counting behavior (PR#1392).

## [3.5] -- 2023-10-06

### Added
Expand Down
22 changes: 13 additions & 9 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ def __init__(self, md, *args, **kwargs):
# Block tags that should contain no content (self closing)
self.empty_tags = set(['hr'])

self.lineno_start_cache = [0]

# This calls self.reset
super().__init__(*args, **kwargs)
self.md = md
Expand All @@ -94,6 +96,8 @@ def reset(self):
self.stack = [] # When `inraw==True`, stack contains a list of tags
self._cache = []
self.cleandoc = []
self.lineno_start_cache = [0]

super().reset()

def close(self):
Expand All @@ -114,15 +118,15 @@ def close(self):
@property
def line_offset(self) -> int:
"""Returns char index in `self.rawdata` for the start of the current line. """
# NOTE(review): this is a diff rendering without +/- markers or indentation.
# The first group of statements below (through `return 0`) appears to be the
# implementation removed by this commit; the `for` loop after it appears to
# be the replacement. Confirm against the actual file before relying on it.
#
# Old approach: on every access, format a regex that matches exactly
# `self.lineno - 1` newline-terminated lines from the start of
# `self.rawdata`; the end of that match is the offset of the current line.
if self.lineno > 1 and '\n' in self.rawdata:
m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
if m:
return m.end()
else: # pragma: no cover
# Value of `self.lineno` must exceed total number of lines.
# Find index of beginning of last line.
# NOTE(review): `rfind('\n')` yields the index of the last newline
# character itself, not the character after it.
return self.rawdata.rfind('\n')
# First line, or no newline in the data at all: the line starts at index 0.
return 0
# New approach: `self.lineno_start_cache[k]` holds the start offset of line
# `k + 1` (the list is set to `[0]` in `__init__` and `reset`, shown earlier
# in this diff). Only the entries missing for lines up to `self.lineno` are
# computed, each with one `str.find` that resumes from the previously cached
# line start.
for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
last_line_start_pos = self.lineno_start_cache[ii]
lf_pos = self.rawdata.find('\n', last_line_start_pos)
if lf_pos == -1:
# No more newlines found. Use end of raw data as start of line beyond end.
# NOTE(review): because of the `+1` below, the value actually cached in
# this case is `len(self.rawdata) + 1`, one past the end of the data.
lf_pos = len(self.rawdata)
# The next line starts on the character after the newline.
self.lineno_start_cache.append(lf_pos+1)

# `self.lineno` is 1-based, so the current line's entry is at `lineno - 1`.
return self.lineno_start_cache[self.lineno-1]

def at_line_start(self) -> bool:
"""
Expand Down

0 comments on commit 524e4da

Please sign in to comment.