transform_md.py: fix the split of history in md files
- be sure to keep tags when splitting files (see the sketch below)
- keep the metadata from the original file in the split files

Fixes: #54
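To illustrate what that means in practice, here is a minimal, self-contained sketch (the helper names and note layout are assumptions, not the project's actual API) of splitting a history note so that every per-date file repeats the original front-matter metadata and inline tags such as #tag1 are left untouched:

# split_sketch.py -- illustrative only; not the actual transform_md.py code.
import re

DATE_HEADING = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)


def parse_front_matter(text):
    "Return (metadata dict, remaining text) for a note with a ---/--- header."
    metadata = {}
    if text.startswith("---\n"):
        header, _, text = text[4:].partition("\n---\n")
        for line in header.splitlines():
            key, _, value = line.partition(":")
            metadata[key.strip()] = value.strip()
    return metadata, text


def strip_heading_marks(body, level):
    "Strip leading '#' only from real headings ('#+ <title>'), keeping tags like #tag1."
    out = []
    for line in body.split("\n"):
        if line.startswith("#" * level) and re.match(r"^#+ ", line):
            line = line[level:].strip()
        out.append(line)
    return "\n".join(out)


note = """---
type: notes
---

## 28 November 2024
# Reading
#tag1
- https://docs.langchain.com/docs/

## 29 November 2024
# Ideas
#tag2
- keep tags and metadata intact
"""

metadata, body = parse_front_matter(note)
parts = DATE_HEADING.split(body)
for idx in range(1, len(parts), 2):
    # each generated file repeats the original metadata block ...
    print("---")
    for key, value in metadata.items():
        print(f"{key}: {value}")
    print("---\n")
    # ... and only heading markers are stripped, so '#tag1' stays a tag
    print("# " + parts[idx] + strip_heading_marks(parts[idx + 1], 1))

The change in this commit works along the same lines: generate_history_files() copies the metadata returned by get_metadata() into each split file, and remove_dash() now only strips '#' from lines that look like a heading ('#+ <title>').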
flepied committed Nov 29, 2024
1 parent e4765da commit e9c97c7
Showing 3 changed files with 74 additions and 45 deletions.
53 changes: 33 additions & 20 deletions integration-test.sh
@@ -2,13 +2,24 @@

set -ex

sudo apt-get install inotify-tools
if [ -f /etc/redhat-release ]; then
    sudo dnf install -y inotify-tools docker-compose
else
    sudo apt-get install inotify-tools docker-compose
fi

TOP=$(mktemp -d -p $HOME)

trap cleanup 0

mkdir $TOP/.second-brain $TOP/Notes

mkdir $HOME/.second-brain $HOME/Notes
# avoid losing my local env if testing locally :-)
test ! -f .env

cat > .env <<EOF
SRCDIR=$HOME/Notes
DSTDIR=$HOME/.second-brain
SRCDIR=$TOP/Notes
DSTDIR=$TOP/.second-brain
EOF

bash -x ./install-systemd-services.sh
@@ -41,7 +52,7 @@ docker-compose logs
docker-compose logs | grep -q "Application startup complete"

# create the document
cat > $HOME/Notes/langchain.md <<EOF
cat > $TOP/Notes/langchain.md <<EOF
## References
- https://docs.langchain.com/docs/
@@ -56,16 +67,18 @@ EOF
TRY=0
while [ $TRY -lt 30 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-txt | grep -q "Storing .* chunks to the db for metadata={'type': 'notes', 'url': 'file://$HOME/Notes/langchain.md'}'"; then
    if journalctl --user -u sba-txt | grep -q "Storing .* chunks to the db for metadata={'type': 'notes', 'url': 'file://$TOP/Notes/langchain.md'}'"; then
        echo "*** Found finished marker"
        break
    fi
    journalctl --user -u sba-txt -u sba-md
    sleep 1
done
journalctl --user -u sba-md
journalctl --user -u sba-txt

journalctl --user -u sba-md | grep -q "processed '$HOME/Notes/langchain.md'"
journalctl --user -u sba-txt -u sba-md

# check again so the script stops if the marker is not present
journalctl --user -u sba-md | grep -q "processed '$TOP/Notes/langchain.md'"

# test the vector store
RES=$(poetry run ./similarity.py "What is langchain?")
@@ -87,20 +100,20 @@ sleep 2
sudo journalctl --user -u sba-md --rotate
sudo journalctl --user -u sba-md --vacuum-time=1s

touch $HOME/Notes/langchain.md
touch $TOP/Notes/langchain.md

TRY=0
while [ $TRY -lt 30 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-md | grep "skipping $HOME/Notes/langchain.md / .* as content did not change"; then
    if journalctl --user -u sba-md | grep "skipping $TOP/Notes/langchain.md / .* as content did not change"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done
journalctl --user -u sba-md
jq . $HOME/.second-brain/checksums.json
journalctl --user -u sba-md | grep "skipping $HOME/Notes/langchain.md / .* as content did not change"
jq . $TOP/.second-brain/checksums.json
journalctl --user -u sba-md | grep "skipping $TOP/Notes/langchain.md / .* as content did not change"

# wait a bit so that all the log entries land in different seconds,
# which the vacuum cleaning process needs in order to work
@@ -110,7 +123,7 @@ sleep 2
sudo journalctl --user -u sba-md --rotate
sudo journalctl --user -u sba-md --vacuum-time=1s

cat >> $HOME/Notes/langchain.md <<EOF
cat >> $TOP/Notes/langchain.md <<EOF
## Links
- https://python.langchain.com/
@@ -119,7 +132,7 @@ EOF
TRY=0
while [ $TRY -lt 30 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-md | grep -q "processed '$HOME/Notes/langchain.md'"; then
    if journalctl --user -u sba-md | grep -q "processed '$TOP/Notes/langchain.md'"; then
        echo "*** Found finished marker"
        break
    fi
@@ -144,31 +157,31 @@ sudo journalctl --user -u sba-md --vacuum-time=1s
sudo journalctl --user -u sba-txt --rotate
sudo journalctl --user -u sba-txt --vacuum-time=1s

rm "$HOME/Notes/langchain.md"
rm "$TOP/Notes/langchain.md"

TRY=0
while [ $TRY -lt 5 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-md | grep -q "removing $HOME/Text/langchain.json as $HOME/Notes/langchain.md do not exist anymore"; then
    if journalctl --user -u sba-md | grep -q "removing $TOP/Text/langchain.json as $TOP/Notes/langchain.md do not exist anymore"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done
journalctl --user -u sba-md
journalctl --user -u sba-md | grep -q "removing $HOME/.second-brain/Text/langchain.json as $HOME/Notes/langchain.md do not exist anymore"
journalctl --user -u sba-md | grep -q "removing $TOP/.second-brain/Text/langchain.json as $TOP/Notes/langchain.md do not exist anymore"

TRY=0
while [ $TRY -lt 5 ]; do
    TRY=$(( TRY + 1 ))
    if journalctl --user -u sba-txt | grep -q "Removing .* related files to $HOME/.second-brain/Text/langchain.json:"; then
    if journalctl --user -u sba-txt | grep -q "Removing .* related files to $TOP/.second-brain/Text/langchain.json:"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done
journalctl --user -u sba-txt
journalctl --user -u sba-txt | grep -q "Removing .* related files to $HOME/.second-brain/Text/langchain.json:"
journalctl --user -u sba-txt | grep -q "Removing .* related files to $TOP/.second-brain/Text/langchain.json:"

# make sure there is no document left in the vector database
poetry run ./similarity.py ""
2 changes: 2 additions & 0 deletions monitor.sh
@@ -31,6 +31,8 @@ fi

"$TRANSFORM" "$SRC" "$DST"

echo "Finished processing $SRC ($TRANSFORM)"

inotifywait -m -e CLOSE_WRITE,DELETE "$SRC"|while read dir event fname; do echo "${dir}${fname} $event" 1>&2; echo "${dir}${fname}"; done | "$TRANSFORM" "-" "$DST"

# monitor.sh ends here
64 changes: 39 additions & 25 deletions transform_md.py
@@ -19,20 +19,20 @@
import assemblyai as aai
import yt_dlp
from dotenv import load_dotenv
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import OpenAIWhisperParser
from langchain_community.document_loaders import (
    AssemblyAIAudioTranscriptLoader,
    PyMuPDFLoader,
    UnstructuredURLLoader,
    YoutubeAudioLoader,
)
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser
from youtube_transcript_api import YouTubeTranscriptApi, _errors

from lib import ChecksumStore, DateTimeEncoder, is_history_filename, is_same_time

YOUTUBE_REGEX = re.compile(r"https://www.youtube.com/embed/([^/\"]+)")
HTTP_REGEX = re.compile(r"https://[^ ]+")
HTTP_REGEX = re.compile(r"https://[^ ]+|file://[^ ]+|~/.+")
IGNORED_REGEX = re.compile(r"^https://(docs.google.com|source.redhat.com)")


@@ -124,6 +124,8 @@ def process_url_line(basename, line, directory, last_accessed_at):
    res = HTTP_REGEX.search(line)
    if res:
        url = res.group(0)
        if url.startswith("~/"):
            url = "file://" + os.path.expanduser(url)
        print(f"found url {url}", file=sys.stderr)
        # skip private or local network urls
        if (
@@ -279,7 +281,9 @@ def remove_dash(content, level):
    lines = content.split("\n")
    dashes = "#" * level
    for idx, line in enumerate(lines):
        if line.startswith(dashes):
        # remove the leading '#' characters only when the line is a real
        # title ('#+ <title>') so that tags like #tag1 are kept
        if line.startswith(dashes) and re.match(r"^#+ ", line):
            lines[idx] = line[level:].strip()
    return "\n".join(lines)

@@ -296,11 +300,6 @@ def get_date(date_str):
return date_str


def clean_referer(referer):
"remove numbers from the referer"
return re.sub(r"\d+", "", referer)


DATE2_REGEXP = re.compile(r"^## (\d\d \w+ \d\d\d\d)", re.MULTILINE)
DATE3_REGEXP = re.compile(r"^### (\d\d \w+ \d\d\d\d)", re.MULTILINE)

@@ -323,31 +322,46 @@ def split_md_file(fname, md_dir):
    files = [fname]
    level = 0
    if len(history) >= 3:
        files = generate_history_files(md_dir, basename, history, level)
        base_fname = os.path.join(md_dir, basename + ".md")
        with open(base_fname, "w", encoding="UTF-8") as fptr:
            fptr.write(history[0])
        files.append(base_fname)
        stat = os.stat(fname)
        os.utime(base_fname, (stat.st_atime, stat.st_mtime))
        for idx in range(1, len(history), 2):
            history_date = get_date(history[idx])
            if isinstance(history_date, str):
                continue
            if level == 1:
                date = history_date.strftime("%d")
            else:
                date = history_date.strftime("%Y%m%d")
            part_fname = os.path.join(md_dir, basename + date + ".md")
            with open(part_fname, "w", encoding="UTF-8") as fptr:
                fptr.write(f"---\nReferer: {clean_referer(basename)}\n---\n\n")
                fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
            mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
            os.utime(part_fname, (mtime, mtime))
            files.append(part_fname)

    print(f"found {len(files)} history files", file=sys.stderr)
    return files


def generate_history_files(md_dir, basename, history, level):
    "Generate history Markdown files from a list of history entries"
    files = []
    # extract header metadata from the base file
    metadata, _ = get_metadata(history[0])
    metadata["referer"] = basename
    for idx in range(1, len(history), 2):
        history_date = get_date(history[idx])
        if isinstance(history_date, str):
            continue
        if level == 1:
            date = history_date.strftime("%d")
        else:
            date = history_date.strftime("%Y%m%d")
        part_fname = os.path.join(md_dir, basename + date + ".md")
        with open(part_fname, "w", encoding="UTF-8") as fptr:
            # write the metadata between --- and ---
            fptr.write("---\n")
            for key, value in metadata.items():
                fptr.write(f"{key}: {value}\n")
            fptr.write("---\n\n")
            fptr.write("# " + history[idx] + remove_dash(history[idx + 1], level))
        mtime = (history_date + datetime.timedelta(hours=12)).timestamp()
        os.utime(part_fname, (mtime, mtime))
        files.append(part_fname)
    return files


def write_output_file(md_file, out_dir, metadata):
"Write the output json file from a markdown file and process the its content"
with open(md_file, "r", encoding="UTF-8") as fptr:
