transform_md.py: fix the split of history in md files
- be sure to keep tags when splitting files (see the sketch below)
- keep the metadata from the original file in the split files

Fixes: #54
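To illustrate what that means in practice, here is a minimal, self-contained sketch (the helper names and note layout are assumptions, not the project's actual API) of splitting a history note so that every per-date file repeats the original front-matter metadata and inline tags such as #tag1 are left untouched:

# split_sketch.py -- illustrative only; not the actual transform_md.py code.
import re

DATE_HEADING = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)


def parse_front_matter(text):
    "Return (metadata dict, remaining text) for a note with a ---/--- header."
    metadata = {}
    if text.startswith("---\n"):
        header, _, text = text[4:].partition("\n---\n")
        for line in header.splitlines():
            key, _, value = line.partition(":")
            metadata[key.strip()] = value.strip()
    return metadata, text


def strip_heading_marks(body, level):
    "Strip leading '#' only from real headings ('#+ <title>'), keeping tags like #tag1."
    out = []
    for line in body.split("\n"):
        if line.startswith("#" * level) and re.match(r"^#+ ", line):
            line = line[level:].strip()
        out.append(line)
    return "\n".join(out)


note = """---
type: notes
---

## 28 November 2024
# Reading
#tag1
- https://docs.langchain.com/docs/

## 29 November 2024
# Ideas
#tag2
- keep tags and metadata intact
"""

metadata, body = parse_front_matter(note)
parts = DATE_HEADING.split(body)
for idx in range(1, len(parts), 2):
    # each generated file repeats the original metadata block ...
    print("---")
    for key, value in metadata.items():
        print(f"{key}: {value}")
    print("---\n")
    # ... and only heading markers are stripped, so '#tag1' stays a tag
    print("# " + parts[idx] + strip_heading_marks(parts[idx + 1], 1))

The change in this commit works along the same lines: generate_history_files() copies the metadata returned by get_metadata() into each split file, and remove_dash() now only strips '#' from lines that look like a heading ('#+ <title>').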
flepied committed Nov 29, 2024
1 parent e4765da commit e9c97c7
Showing 3 changed files with 74 additions and 45 deletions.
53 changes: 33 additions & 20 deletions integration-test.sh
@@ -2,13 +2,24 @@

set -ex

sudo apt-get install inotify-tools
if [ -f /etc/redhat-release ]; then
    sudo dnf install -y inotify-tools docker-compose
else
    sudo apt-get install inotify-tools docker-compose
fi

TOP=$(mktemp -d -p $HOME)

trap cleanup 0

mkdir $TOP/.second-brain $TOP/Notes

mkdir $HOME/.second-brain $HOME/Notes
# avoid losing my local env if testing locally :-)
test ! -f .env

cat > .env <<EOF
SRCDIR=$HOME/Notes
DSTDIR=$HOME/.second-brain
SRCDIR=$TOP/Notes
DSTDIR=$TOP/.second-brain
EOF

bash -x ./install-systemd-services.sh
@@ -41,7 +52,7 @@ docker-compose logs
docker-compose logs | grep -q "Application startup complete"

# create the document
cat > $HOME/Notes/langchain.md <<EOF
cat > $TOP/Notes/langchain.md <<EOF
## References
- https://docs.langchain.com/docs/
@@ -56,16 +67,18 @@ EOF
TRY=0
while [ $TRY -lt 30 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-txt | grep -q "Storing .* chunks to the db for metadata={'type': 'notes', 'url': 'file://$HOME/Notes/langchain.md'}'"; then
    if journalctl --user -u sba-txt | grep -q "Storing .* chunks to the db for metadata={'type': 'notes', 'url': 'file://$TOP/Notes/langchain.md'}'"; then
        echo "*** Found finished marker"
        break
    fi
    journalctl --user -u sba-txt -u sba-md
    sleep 1
done
journalctl --user -u sba-md
journalctl --user -u sba-txt

journalctl --user -u sba-md | grep -q "processed '$HOME/Notes/langchain.md'"
journalctl --user -u sba-txt -u sba-md

# check again so the script stops if the marker is not present
journalctl --user -u sba-md | grep -q "processed '$TOP/Notes/langchain.md'"

# test the vector store
RES=$(poetry run ./similarity.py "What is langchain?")
@@ -87,20 +100,20 @@ sleep 2
sudo journalctl --user -u sba-md --rotate
sudo journalctl --user -u sba-md --vacuum-time=1s

touch $HOME/Notes/langchain.md
touch $TOP/Notes/langchain.md

TRY=0
while [ $TRY -lt 30 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-md | grep "skipping $HOME/Notes/langchain.md / .* as content did not change"; then
    if journalctl --user -u sba-md | grep "skipping $TOP/Notes/langchain.md / .* as content did not change"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done
journalctl --user -u sba-md
jq . $HOME/.second-brain/checksums.json
journalctl --user -u sba-md | grep "skipping $HOME/Notes/langchain.md / .* as content did not change"
jq . $TOP/.second-brain/checksums.json
journalctl --user -u sba-md | grep "skipping $TOP/Notes/langchain.md / .* as content did not change"

# wait a bit so that all the log entries land in different seconds,
# which the vacuum cleaning process needs in order to work
@@ -110,7 +123,7 @@ sleep 2
sudo journalctl --user -u sba-md --rotate
sudo journalctl --user -u sba-md --vacuum-time=1s

cat >> $HOME/Notes/langchain.md <<EOF
cat >> $TOP/Notes/langchain.md <<EOF
## Links
- https://python.langchain.com/
@@ -119,7 +132,7 @@ EOF
TRY=0
while [ $TRY -lt 30 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-md | grep -q "processed '$HOME/Notes/langchain.md'"; then
    if journalctl --user -u sba-md | grep -q "processed '$TOP/Notes/langchain.md'"; then
        echo "*** Found finished marker"
        break
    fi
@@ -144,31 +157,31 @@ sudo journalctl --user -u sba-md --vacuum-time=1s
sudo journalctl --user -u sba-txt --rotate
sudo journalctl --user -u sba-txt --vacuum-time=1s

rm "$HOME/Notes/langchain.md"
rm "$TOP/Notes/langchain.md"

TRY=0
while [ $TRY -lt 5 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-md | grep -q "removing $HOME/Text/langchain.json as $HOME/Notes/langchain.md do not exist anymore"; then
    if journalctl --user -u sba-md | grep -q "removing $TOP/Text/langchain.json as $TOP/Notes/langchain.md do not exist anymore"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done
journalctl --user -u sba-md
journalctl --user -u sba-md | grep -q "removing $HOME/.second-brain/Text/langchain.json as $HOME/Notes/langchain.md do not exist anymore"
journalctl --user -u sba-md | grep -q "removing $TOP/.second-brain/Text/langchain.json as $TOP/Notes/langchain.md do not exist anymore"

TRY=0
while [ $TRY -lt 5 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-txt | grep -q "Removing .* related files to $HOME/.second-brain/Text/langchain.json:"; then
    if journalctl --user -u sba-txt | grep -q "Removing .* related files to $TOP/.second-brain/Text/langchain.json:"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done
journalctl --user -u sba-txt
journalctl --user -u sba-txt | grep -q "Removing .* related files to $HOME/.second-brain/Text/langchain.json:"
journalctl --user -u sba-txt | grep -q "Removing .* related files to $TOP/.second-brain/Text/langchain.json:"

# make sure there is no document left in the vector database
poetry run ./similarity.py ""
2 changes: 2 additions & 0 deletions monitor.sh
@@ -31,6 +31,8 @@ fi

"$TRANSFORM" "$SRC" "$DST"

echo "Finished processing $SRC ($TRANSFORM)"

inotifywait -m -e CLOSE_WRITE,DELETE "$SRC"|while read dir event fname; do echo "${dir}${fname} $event" 1>&2; echo "${dir}${fname}"; done | "$TRANSFORM" "-" "$DST"

# monitor.sh ends here
64 changes: 39 additions & 25 deletions transform_md.py
@@ -19,20 +19,20 @@
import assemblyai as aai
import yt_dlp
from dotenv import load_dotenv
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders import (
    AssemblyAIAudioTranscriptLoader,
    PyMuPDFLoader,
    UnstructuredURLLoader,
    YoutubeAudioLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from youtube_transcript_api import YouTubeTranscriptApi, _errors

from lib import ChecksumStore, DateTimeEncoder, is_history_filename, is_same_time

YOUTUBE_REGEX = re.compile(r"https://www.youtube.com/embed/([^/\"]+)")
HTTP_REGEX = re.compile(r"https://[^ ]+")
HTTP_REGEX = re.compile(r"https://[^ ]+|file://[^ ]+|~/.+")
IGNORED_REGEX = re.compile(r"^https://(docs.google.com|source.redhat.com)")


@@ -124,6 +124,8 @@ def process_url_line(basename, line, directory, last_accessed_at):
    res = HTTP_REGEX.search(line)
    if res:
        url = res.group(0)
        if url.startswith("~/"):
            url = "file://" + os.path.expanduser(url)
        print(f"found url {url}", file=sys.stderr)
        # skip private or local network urls
        if (
@@ -279,7 +281,9 @@ def remove_dash(content, level):
    lines = content.split("\n")
    dashes = "#" * level
    for idx, line in enumerate(lines):
        if line.startswith(dashes):
        # remove the leading '#' characters only when the line is a real
        # title ('#+ <title>') so that tags like #tag1 are kept
        if line.startswith(dashes) and re.match(r"^#+ ", line):
            lines[idx] = line[level:].strip()
    return "\n".join(lines)

@@ -296,11 +300,6 @@ def get_date(date_str):
return date_str


def clean_referer(referer):
"remove numbers from the referer"
return re.sub(r"\d+", "", referer)


DATE2_REGEXP = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)
DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)

@@ -323,31 +322,46 @@ def split_md_file(fname, md_dir):
    files = [fname]
    level = 0
    if len(history) >= 3:
        files = generate_history_files(md_dir, basename, history, level)
        base_fname = os.path.join(md_dir, basename + ".md")
        with open(base_fname, "w", encoding="UTF-8") as fptr:
            fptr.write(history[0])
        files.append(base_fname)
        stat = os.stat(fname)
        os.utime(base_fname, (stat.st_atime, stat.st_mtime))
        for idx in range(1, len(history), 2):
            history_date = get_date(history[idx])
            if isinstance(history_date, str):
                continue
            if level == 1:
                date = history_date.strftime("%d")
            else:
                date = history_date.strftime("%Y%m%d")
            part_fname = os.path.join(md_dir, basename + date + ".md")
            with open(part_fname, "w", encoding="UTF-8") as fptr:
                fptr.write(f"---\nReferer: {clean_referer(basename)}\n---\n\n")
                fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
            mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
            os.utime(part_fname, (mtime, mtime))
            files.append(part_fname)

    print(f"found {len(files)} history files", file=sys.stderr)
    return files


def generate_history_files(md_dir, basename, history, level):
    "Generate history Markdown files from a list of history entries"
    files = []
    # extract header metadata from the base file
    metadata, _ = get_metadata(history[0])
    metadata["referer"] = basename
    for idx in range(1, len(history), 2):
        history_date = get_date(history[idx])
        if isinstance(history_date, str):
            continue
        if level == 1:
            date = history_date.strftime("%d")
        else:
            date = history_date.strftime("%Y%m%d")
        part_fname = os.path.join(md_dir, basename + date + ".md")
        with open(part_fname, "w", encoding="UTF-8") as fptr:
            # write the metadata between --- and ---
            fptr.write("---\n")
            for key, value in metadata.items():
                fptr.write(f"{key}: {value}\n")
            fptr.write("---\n\n")
            fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
        mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
        os.utime(part_fname, (mtime, mtime))
        files.append(part_fname)
    return files


def write_output_file(md_file, out_dir, metadata):
"Write the output json file from a markdown file and process the its content"
with open(md_file, "r", encoding="UTF-8") as fptr:
