Skip to content

Commit

Permalink
add support for deleting files
Browse files Browse the repository at this point in the history
  • Loading branch information
flepied committed Sep 20, 2023
1 parent 3eac15d commit 76d5ce9
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 20 deletions.
74 changes: 57 additions & 17 deletions integration-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,25 @@ SRCDIR=$HOME/Notes
DSTDIR=$HOME/.second-brain
EOF

cat > $HOME/Notes/langchain.md <<EOF
## References
- https://docs.langchain.com/docs/
- https://blog.langchain.dev/conversational-retrieval-agents/
Plan-and-Solve Prompting: Improving Zero-Shot
Chain-of-Thought Reasoning by Large Language Models
https://arxiv.org/pdf/2305.04091.pdf
EOF

bash -x ./install-systemd-services.sh

sleep 5

TRY=0
# wait for chromadb to be started
docker-compose ps
while [ $TRY -lt 24 ]; do
TRY=$(( TRY + 1 ))
TRY=$(( TRY + 1 ))
if docker-compose logs | grep -q "Application startup complete"; then
echo "*** Found finished marker"
break
fi
fi
docker-compose logs
sleep 5
done
docker-compose logs
docker-compose logs | grep -q "Application startup complete"

sudo journalctl -u sba-md
sudo journalctl -u sba-txt

# create the document
cat > $HOME/Notes/langchain.md <<EOF
## References
Expand All @@ -51,6 +41,22 @@ Chain-of-Thought Reasoning by Large Language Models
https://arxiv.org/pdf/2305.04091.pdf
EOF

# Wait for sba-md to report that the markdown file was processed.
# Poll the service journal once per second, giving up after 30 attempts.
attempt=0
until [ $attempt -ge 30 ]; do
    attempt=$(( attempt + 1 ))
    if sudo journalctl -u sba-md | grep -q "processed '$HOME/Notes/langchain.md'"; then
        echo "*** Found finished marker"
        break
    fi
    sleep 1
done

sudo journalctl -u sba-md
sudo journalctl -u sba-txt

sudo journalctl -u sba-md | grep -q "processed '$HOME/Notes/langchain.md'"

# test the vector store
RES=$(poetry run ./similarity.py "What is langchain?")
echo "$RES"
Expand Down Expand Up @@ -118,4 +124,38 @@ NB=$(sudo journalctl -u sba-md | grep -c "content is the same for")
echo "*** NB=$NB"
test "$NB" -eq 2

# wait a bit to be sure to have all the logs in different seconds
# for the vacuum cleaning process to work
sleep 2

# test removing a document
# Rotate the active journal files and immediately vacuum anything older
# than one second, so the greps below only match messages emitted AFTER
# the file is deleted (earlier "processed ..." lines would otherwise match).
sudo journalctl -u sba-md --rotate
sudo journalctl -u sba-md --vacuum-time=1s
sudo journalctl -u sba-txt --rotate
sudo journalctl -u sba-txt --vacuum-time=1s

# Delete the source note; the sba-md watcher should pick up the DELETE event.
rm "$HOME/Notes/langchain.md"

# Poll up to 5 seconds for sba-md to log that it removed the derived
# JSON file after the markdown source disappeared. The grep string must
# stay byte-identical to the message printed by transform_md.py.
TRY=0
while [ $TRY -lt 5 ]; do
TRY=$(( TRY + 1 ))
if sudo journalctl -u sba-md | grep -q "removing $HOME/Text/langchain.json as $HOME/Notes/langchain.md do not exist anymore"; then
echo "*** Found finished marker"
break
fi
sleep 1
done
# Final assertion: repeat the grep so a missing marker fails the test run
# (assumes the script runs with errexit semantics — confirm set -e upstream).
sudo journalctl -u sba-md | grep -q "removing $HOME/Text/langchain.json as $HOME/Notes/langchain.md do not exist anymore"

# Poll up to 5 seconds for sba-txt to log that it purged the chunks
# related to the deleted document from the vector store.
# NOTE(review): $NB here is the count from the earlier "content is the
# same for" check (2), reused as the expected number of related chunk
# files — these agree only coincidentally; verify they must be equal.
TRY=0
while [ $TRY -lt 5 ]; do
TRY=$(( TRY + 1 ))
if sudo journalctl -u sba-txt | grep -q "Removing $NB related files to $HOME/Text/langchain.json:"; then
echo "*** Found finished marker"
break
fi
sleep 1
done
# Final assertion: fail the test if the removal message never appeared.
sudo journalctl -u sba-txt | grep -q "Removing $NB related files to $HOME/Text/langchain.json:"

# integration-test.sh ends here
4 changes: 1 addition & 3 deletions monitor.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,8 @@ else
. .venv/bin/activate
fi



"$TRANSFORM" "$SRC" "$DST"

inotifywait -m -e CLOSE_WRITE "$SRC"|while read dir event fname; do echo "${dir}${fname} $event" 1>&2; echo "${dir}${fname}"; done | "$TRANSFORM" "-" "$DST"
inotifywait -m -e CLOSE_WRITE,DELETE "$SRC"|while read dir event fname; do echo "${dir}${fname} $event" 1>&2; echo "${dir}${fname}"; done | "$TRANSFORM" "-" "$DST"

# monitor.sh ends here
17 changes: 17 additions & 0 deletions transform_md.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
"""

import datetime
import glob
import hashlib
import json
import os
Expand Down Expand Up @@ -393,6 +394,22 @@ def process_md_file(fname, out_dir, checksum_store):
return False
basename = os.path.basename(fname[:-3])
oname = get_output_file_path(out_dir, basename)
if not os.path.exists(fname):
print(f"removing {oname} as {fname} do not exist anymore", file=sys.stderr)
os.remove(oname)
if is_history_filename(fname):
for hname in glob.glob(
os.path.join(out_dir, "Markdown", basename + "*.md")
):
basename = os.path.basename(hname[:-3])
oname = get_output_file_path(out_dir, basename)
print(
f"removing {hname} / {oname} as {fname} do not exist anymore",
file=sys.stderr,
)
os.remove(hname)
os.remove(oname)
return True
if is_same_time(fname, oname):
print(f"skipping {fname} as there is no time change", file=sys.stderr)
return False
Expand Down
17 changes: 17 additions & 0 deletions transform_txt.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,27 @@ def validate_and_extract_url(fname, basename, out_dir):
return metadata, data["text"]


def remove_related_files(fname, indexer, out_dir):
    """Delete every chunk derived from *fname* from the index and disk.

    Looks up all index entries whose ``main_source`` metadata equals
    ``fname``, logs what is about to be purged, then removes the entries
    from the vector index and unlinks the matching files under
    ``<out_dir>/Chunk``.
    """
    hits = indexer.get(where={"main_source": {"$eq": fname}})
    chunk_ids = hits["ids"]
    # Log the purge to stderr so it lands in the service journal
    # (the integration test greps for this exact message).
    print(
        f"Removing {len(chunk_ids)} related files to {fname}: {' '.join(chunk_ids)}",
        file=sys.stderr,
    )
    if chunk_ids:
        # Drop the index entries first, then their on-disk chunk files.
        indexer.delete(chunk_ids)
        for cid in chunk_ids:
            os.remove(os.path.join(out_dir, "Chunk", cid))


def process_file(fname: str, out_dir: str, indexer, splitter):
"Cut a text file in multiple chunks"
basename = os.path.basename(fname[:-5])
print(f"Processing '{fname}' '{basename}'", file=sys.stderr)
if not os.path.exists(fname):
print(f"File {fname} does not exist anymore", file=sys.stderr)
remove_related_files(fname, indexer, out_dir)
return
metadata, content = validate_and_extract_url(fname, basename, out_dir)
if metadata is False:
return
Expand Down

0 comments on commit 76d5ce9

Please sign in to comment.