Skip to content

Commit

Permalink
Replace Qt RegEx with Python RegEx (#2028)
Browse files Browse the repository at this point in the history
  • Loading branch information
vkbo authored Sep 22, 2024
2 parents 1aa58c0 + 17b4a7b commit 5c07148
Show file tree
Hide file tree
Showing 9 changed files with 155 additions and 143 deletions.
1 change: 1 addition & 0 deletions novelwriter/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ class nwConst:

class nwRegEx:

WORDS = r"\b[^\s\-\+\/–—\[\]:]+\b"
FMT_EI = r"(?<![\w\\])(_)(?![\s_])(.+?)(?<![\s\\])(\1)(?!\w)"
FMT_EB = r"(?<![\w\\])(\*{2})(?![\s\*])(.+?)(?<![\s\\])(\1)(?!\w)"
FMT_ST = r"(?<![\w\\])(~{2})(?![\s~])(.+?)(?<![\s\\])(\1)(?!\w)"
Expand Down
26 changes: 12 additions & 14 deletions novelwriter/core/coretools.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,15 @@
from __future__ import annotations

import logging
import re
import shutil

from collections.abc import Iterable
from functools import partial
from pathlib import Path
from zipfile import ZipFile, is_zipfile

from PyQt5.QtCore import QCoreApplication, QRegularExpression
from PyQt5.QtCore import QCoreApplication

from novelwriter import CONFIG, SHARED
from novelwriter.common import isHandle, minmax, simplified
Expand Down Expand Up @@ -297,8 +298,8 @@ def duplicate(self, items: list[str]) -> Iterable[tuple[str, str | None]]:
class DocSearch:

def __init__(self) -> None:
    """Initialise the search state.

    Starts with an empty compiled pattern, case-insensitive Unicode
    matching, whole-word matching switched off, and escaping of the
    search text switched on.
    """
    # Placeholder pattern; iterSearch compiles the real one from the
    # user's search text together with self._opts.
    self._regEx = re.compile("")
    # Default flags correspond to a case-insensitive search.
    self._opts = re.UNICODE | re.IGNORECASE
    self._words = False
    self._escape = True
    return
Expand All @@ -309,10 +310,9 @@ def __init__(self) -> None:

def setCaseSensitive(self, state: bool) -> None:
    """Set the case sensitive search flag.

    Only the stored flags are updated here; they are applied when the
    pattern is next compiled with ``self._opts``.

    Parameters
    ----------
    state : bool
        True for a case-sensitive search, False to ignore case.
    """
    # Always rebuild from the base flag so a previous IGNORECASE
    # setting does not linger when switching to case-sensitive.
    self._opts = re.UNICODE
    if not state:
        self._opts |= re.IGNORECASE
    return

def setWholeWords(self, state: bool) -> None:
Expand All @@ -329,8 +329,8 @@ def iterSearch(
self, project: NWProject, search: str
) -> Iterable[tuple[NWItem, list[tuple[int, int, str]], bool]]:
"""Iteratively search through documents in a project."""
self._regEx.setPattern(self._buildPattern(search))
logger.debug("Searching with pattern '%s'", self._regEx.pattern())
self._regEx = re.compile(self._buildPattern(search), self._opts)
logger.debug("Searching with pattern '%s'", self._regEx.pattern)
storage = project.storage
for item in project.tree:
if item.isFileType():
Expand All @@ -340,14 +340,12 @@ def iterSearch(

def searchText(self, text: str) -> tuple[list[tuple[int, int, str]], bool]:
"""Search a piece of text for RegEx matches."""
rxItt = self._regEx.globalMatch(text)
count = 0
capped = False
results = []
while rxItt.hasNext():
rxMatch = rxItt.next()
pos = rxMatch.capturedStart()
num = rxMatch.capturedLength()
for match in re.finditer(self._regEx, text):
pos = match.start(0)
num = len(match.group(0))
lim = text[:pos].rfind("\n") + 1
cut = text[lim:pos].rfind(" ") + lim + 1
context = text[cut:cut+100].partition("\n")[0]
Expand All @@ -366,7 +364,7 @@ def searchText(self, text: str) -> tuple[list[tuple[int, int, str]], bool]:
def _buildPattern(self, search: str) -> str:
    """Build the search pattern string.

    Parameters
    ----------
    search : str
        The raw search text.

    Returns
    -------
    str
        The pattern source: escaped when ``self._escape`` is set, and
        wrapped in word-boundary alternations when ``self._words`` is set.
    """
    if self._escape:
        # Escape exactly once so regex metacharacters in the search
        # text are matched literally.
        search = re.escape(search)
    if self._words:
        # Require a line anchor or word boundary on both sides.
        search = f"(?:^|\\b){search}(?:$|\\b)"
    return search
Expand Down
46 changes: 18 additions & 28 deletions novelwriter/core/tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
from pathlib import Path
from time import time

from PyQt5.QtCore import QCoreApplication, QRegularExpression
from PyQt5.QtCore import QCoreApplication
from PyQt5.QtGui import QFont

from novelwriter import CONFIG
Expand Down Expand Up @@ -234,7 +234,7 @@ def __init__(self, project: NWProject) -> None:
nwShortcode.FOOTNOTE_B: self.FMT_FNOTE,
}

self._rxDialogue: list[tuple[QRegularExpression, int, int]] = []
self._rxDialogue: list[tuple[re.Pattern, int, int]] = []

return

Expand Down Expand Up @@ -1109,55 +1109,45 @@ def _extractFormats(

# Match Markdown
for regEx, fmts in self._rxMarkdown:
rxItt = regEx.globalMatch(text, 0)
while rxItt.hasNext():
rxMatch = rxItt.next()
for match in re.finditer(regEx, text):
temp.extend(
(rxMatch.capturedStart(n), rxMatch.capturedLength(n), fmt, "")
(match.start(n), match.end(n), fmt, "")
for n, fmt in enumerate(fmts) if fmt > 0
)

# Match Shortcodes
rxItt = self._rxShortCodes.globalMatch(text, 0)
while rxItt.hasNext():
rxMatch = rxItt.next()
for match in re.finditer(REGEX_PATTERNS.shortcodePlain, text):
temp.append((
rxMatch.capturedStart(1),
rxMatch.capturedLength(1),
self._shortCodeFmt.get(rxMatch.captured(1).lower(), 0),
match.start(1), match.end(1),
self._shortCodeFmt.get(match.group(1).lower(), 0),
"",
))

# Match Shortcode w/Values
rxItt = self._rxShortCodeVals.globalMatch(text, 0)
tHandle = self._handle or ""
while rxItt.hasNext():
rxMatch = rxItt.next()
kind = self._shortCodeVals.get(rxMatch.captured(1).lower(), 0)
for match in re.finditer(REGEX_PATTERNS.shortcodeValue, text):
kind = self._shortCodeVals.get(match.group(1).lower(), 0)
temp.append((
rxMatch.capturedStart(0),
rxMatch.capturedLength(0),
match.start(0), match.end(0),
self.FMT_STRIP if kind == skip else kind,
f"{tHandle}:{rxMatch.captured(2)}",
f"{tHandle}:{match.group(2)}",
))

# Match Dialogue
if self._rxDialogue and hDialog:
for regEx, fmtB, fmtE in self._rxDialogue:
rxItt = regEx.globalMatch(text, 0)
while rxItt.hasNext():
rxMatch = rxItt.next()
temp.append((rxMatch.capturedStart(0), 0, fmtB, ""))
temp.append((rxMatch.capturedEnd(0), 0, fmtE, ""))
for match in re.finditer(regEx, text):
temp.append((match.start(0), 0, fmtB, ""))
temp.append((match.end(0), 0, fmtE, ""))

# Post-process text and format
result = text
formats = []
for pos, n, fmt, key in reversed(sorted(temp, key=lambda x: x[0])):
for pos, end, fmt, key in reversed(sorted(temp, key=lambda x: x[0])):
if fmt > 0:
if n > 0:
result = result[:pos] + result[pos+n:]
formats = [(p-n if p > pos else p, f, k) for p, f, k in formats]
if end > pos:
result = result[:pos] + result[end:]
formats = [(p+pos-end if p > pos else p, f, k) for p, f, k in formats]
formats.insert(0, (pos, fmt, key))

return result, formats
Expand Down
75 changes: 32 additions & 43 deletions novelwriter/gui/dochighlight.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,28 @@
from __future__ import annotations

import logging
import re

from time import time

from PyQt5.QtCore import QRegularExpression, Qt
from PyQt5.QtCore import Qt
from PyQt5.QtGui import (
QBrush, QColor, QFont, QSyntaxHighlighter, QTextBlockUserData,
QTextCharFormat, QTextDocument
)

from novelwriter import CONFIG, SHARED
from novelwriter.common import checkInt
from novelwriter.constants import nwHeaders, nwRegEx, nwUnicode
from novelwriter.constants import nwHeaders, nwUnicode
from novelwriter.core.index import processComment
from novelwriter.enum import nwComment
from novelwriter.text.patterns import REGEX_PATTERNS
from novelwriter.types import QRegExUnicode

logger = logging.getLogger(__name__)

SPELLRX = QRegularExpression(r"\b[^\s\-\+\/–—\[\]:]+\b")
SPELLRX.setPatternOptions(QRegExUnicode)
SPELLSC = QRegularExpression(nwRegEx.FMT_SC)
SPELLSC.setPatternOptions(QRegExUnicode)
SPELLSV = QRegularExpression(nwRegEx.FMT_SV)
SPELLSV.setPatternOptions(QRegExUnicode)
RX_WORDS = REGEX_PATTERNS.wordSplit
RX_FMT_SC = REGEX_PATTERNS.shortcodePlain
RX_FMT_SV = REGEX_PATTERNS.shortcodeValue

BLOCK_NONE = 0
BLOCK_TEXT = 1
Expand All @@ -76,9 +73,9 @@ def __init__(self, document: QTextDocument) -> None:
self._spellErr = QTextCharFormat()

self._hStyles: dict[str, QTextCharFormat] = {}
self._minRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = []
self._txtRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = []
self._cmnRules: list[tuple[QRegularExpression, dict[int, QTextCharFormat]]] = []
self._minRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = []
self._txtRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = []
self._cmnRules: list[tuple[re.Pattern, dict[int, QTextCharFormat]]] = []

self.initHighlighter()

Expand Down Expand Up @@ -135,8 +132,7 @@ def initHighlighter(self) -> None:

# Multiple or Trailing Spaces
if CONFIG.showMultiSpaces:
rxRule = QRegularExpression(r"[ ]{2,}|[ ]*$")
rxRule.setPatternOptions(QRegExUnicode)
rxRule = re.compile(r"[ ]{2,}|[ ]*$", re.UNICODE)
hlRule = {
0: self._hStyles["mspaces"],
}
Expand All @@ -145,8 +141,7 @@ def initHighlighter(self) -> None:
self._cmnRules.append((rxRule, hlRule))

# Non-Breaking Spaces
rxRule = QRegularExpression(f"[{nwUnicode.U_NBSP}{nwUnicode.U_THNBSP}]+")
rxRule.setPatternOptions(QRegExUnicode)
rxRule = re.compile(f"[{nwUnicode.U_NBSP}{nwUnicode.U_THNBSP}]+", re.UNICODE)
hlRule = {
0: self._hStyles["nobreak"],
}
Expand Down Expand Up @@ -237,17 +232,15 @@ def initHighlighter(self) -> None:
self._cmnRules.append((rxRule, hlRule))

# Alignment Tags
rxRule = QRegularExpression(r"(^>{1,2}|<{1,2}$)")
rxRule.setPatternOptions(QRegExUnicode)
rxRule = re.compile(r"(^>{1,2}|<{1,2}$)", re.UNICODE)
hlRule = {
1: self._hStyles["markup"],
}
self._minRules.append((rxRule, hlRule))
self._txtRules.append((rxRule, hlRule))

# Auto-Replace Tags
rxRule = QRegularExpression(r"<(\S+?)>")
rxRule.setPatternOptions(QRegExUnicode)
rxRule = re.compile(r"<(\S+?)>", re.UNICODE)
hlRule = {
0: self._hStyles["replace"],
}
Expand Down Expand Up @@ -409,12 +402,10 @@ def highlightBlock(self, text: str) -> None:

if hRules:
for rX, hRule in hRules:
rxItt = rX.globalMatch(text, xOff)
while rxItt.hasNext():
rxMatch = rxItt.next()
for match in re.finditer(rX, text[xOff:]):
for xM, hFmt in hRule.items():
xPos = rxMatch.capturedStart(xM)
xEnd = rxMatch.capturedEnd(xM)
xPos = match.start(xM) + xOff
xEnd = match.end(xM) + xOff
for x in range(xPos, xEnd):
cFmt = self.format(x)
if cFmt.fontStyleName() != "markup":
Expand All @@ -427,8 +418,8 @@ def highlightBlock(self, text: str) -> None:
self.setCurrentBlockUserData(data)

if self._spellCheck:
for xPos, xLen in data.spellCheck(text, xOff):
for x in range(xPos, xPos+xLen):
for xPos, xEnd in data.spellCheck(text, xOff):
for x in range(xPos, xEnd):
cFmt = self.format(x)
cFmt.merge(self._spellErr)
self.setFormat(x, 1, cFmt)
Expand Down Expand Up @@ -492,22 +483,20 @@ def spellCheck(self, text: str, offset: int) -> list[tuple[int, int]]:
"""
if "[" in text:
# Strip shortcodes
for rX in [SPELLSC, SPELLSV]:
rxItt = rX.globalMatch(text, offset)
while rxItt.hasNext():
rxMatch = rxItt.next()
xPos = rxMatch.capturedStart(0)
xLen = rxMatch.capturedLength(0)
xEnd = rxMatch.capturedEnd(0)
text = text[:xPos] + " "*xLen + text[xEnd:]
for rX in [RX_FMT_SC, RX_FMT_SV]:
for match in re.finditer(rX, text[offset:]):
iS = match.start(0) + offset
iE = match.end(0) + offset
if iS >= 0 and iE >= 0:
text = text[:iS] + " "*(iE - iS) + text[iE:]

self._spellErrors = []
rxSpell = SPELLRX.globalMatch(text.replace("_", " "), offset)
while rxSpell.hasNext():
rxMatch = rxSpell.next()
if not SHARED.spelling.checkWord(rxMatch.captured(0)):
if not rxMatch.captured(0).isnumeric() and not rxMatch.captured(0).isupper():
self._spellErrors.append(
(rxMatch.capturedStart(0), rxMatch.capturedLength(0))
)
checker = SHARED.spelling
for match in re.finditer(RX_WORDS, text[offset:].replace("_", " ")):
if (
(word := match.group(0))
and not (word.isnumeric() or word.isupper() or checker.checkWord(word))
):
self._spellErrors.append((match.start(0) + offset, match.end(0) + offset))

return self._spellErrors
4 changes: 2 additions & 2 deletions novelwriter/gui/editordocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ def spellErrorAtPos(self, pos: int) -> tuple[str, int, int, list[str]]:
text = block.text()
check = pos - block.position()
if check >= 0:
for cPos, cLen in data.spellErrors:
cEnd = cPos + cLen
for cPos, cEnd in data.spellErrors:
cLen = cEnd - cPos
if cPos <= check <= cEnd:
word = text[cPos:cEnd]
return word, cPos, cLen, SHARED.spelling.suggestWords(word)
Expand Down
Loading

0 comments on commit 5c07148

Please sign in to comment.