Ignore markdown codeblocks for tags (#66)

* Ignore markdown codeblocks for tags
* Changed `content_ex_tags` to use `content` instead of `content_ex_codeblock`
* Bumped `INDEX_SCHEMA_VERSION`
dullage · May 21, 2023 · 9d24d42 · 9d24d42
1 parent 2322e3f
commit 9d24d42
Showing 1 changed file with 5 additions and 2 deletions.
diff --git a/flatnotes/flatnotes.py b/flatnotes/flatnotes.py
@@ -20,7 +20,7 @@
 from logger import logger
 
 MARKDOWN_EXT = ".md"
-INDEX_SCHEMA_VERSION = "3"
+INDEX_SCHEMA_VERSION = "4"
 
 StemmingFoldingAnalyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
 
@@ -162,6 +162,7 @@ def _get_matched_fields(matched_terms):
 
 class Flatnotes(object):
     TAGS_RE = re.compile(r"(?:(?<=^#)|(?<=\s#))\w+(?=\s|$)")
+    CODEBLOCK_RE = re.compile(r"`{1,3}.*?`{1,3}", re.DOTALL)
     TAGS_WITH_HASH_RE = re.compile(r"(?:(?<=^)|(?<=\s))#\w+(?=\s|$)")
 
     def __init__(self, dir: str) -> None:
@@ -203,7 +204,9 @@ def extract_tags(cls, content) -> Tuple[str, Set[str]]:
 
         - The content without the tags.
         - A set of tags converted to lowercase."""
-        content_ex_tags, tags = re_extract(cls.TAGS_RE, content)
+        content_ex_codeblock = re.sub(cls.CODEBLOCK_RE, '', content)
+        _, tags = re_extract(cls.TAGS_RE, content_ex_codeblock)
+        content_ex_tags, _ = re_extract(cls.TAGS_RE, content)
         try:
             tags = [tag.lower() for tag in tags]
             return (content_ex_tags, set(tags))