From d85d991630e50bec9607667840cad946ef988864 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Fri, 4 Oct 2024 13:33:47 +1000 Subject: [PATCH 1/2] Added support for .syftignore files (simple prefix matching) --- default_apps/adder/main.py | 8 ++- default_apps/logged_in/main.py | 13 ++--- syftbox/client/plugins/sync.py | 95 +++++++++++++++++++++++++++++++--- 3 files changed, 100 insertions(+), 16 deletions(-) diff --git a/default_apps/adder/main.py b/default_apps/adder/main.py index b5d8def2..06416281 100644 --- a/default_apps/adder/main.py +++ b/default_apps/adder/main.py @@ -6,8 +6,12 @@ config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None) client_config = ClientConfig.load(config_path) -input_folder = f"{client_config.sync_folder}/{client_config.email}/app_pipelines/adder/inputs/" -output_folder = f"{client_config.sync_folder}/{client_config.email}/app_pipelines/adder/done/" +input_folder = ( + f"{client_config.sync_folder}/{client_config.email}/app_pipelines/adder/inputs/" +) +output_folder = ( + f"{client_config.sync_folder}/{client_config.email}/app_pipelines/adder/done/" +) os.makedirs(input_folder, exist_ok=True) os.makedirs(output_folder, exist_ok=True) diff --git a/default_apps/logged_in/main.py b/default_apps/logged_in/main.py index d006f30f..f950aee3 100644 --- a/default_apps/logged_in/main.py +++ b/default_apps/logged_in/main.py @@ -1,8 +1,10 @@ -import os import json +import os from datetime import datetime + from syftbox.lib import ClientConfig + def main(): # Load the client configuration config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None) @@ -12,9 +14,7 @@ def main(): current_timestamp = datetime.now().isoformat() # Prepare the data to be written - timestamp_data = { - "last_check_in": current_timestamp - } + timestamp_data = {"last_check_in": current_timestamp} # Prepare output folders output_folder = f"{client_config.sync_folder}/{client_config.email}/app_pipelines/timestamp_recorder/" @@ -31,7 +31,7 @@ def main(): "read": ["GLOBAL"], "write": [client_config.email], "filepath": f"{output_folder}_.syftperm", - "terminal": False + "terminal": False, } syftperm_path = f"{output_folder}_.syftperm" with open(syftperm_path, "w") as f: @@ -40,5 +40,6 @@ def main(): print(f"Timestamp has been written to {output_file_path}") print(f"_.syftperm file has been written to {syftperm_path}") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/syftbox/client/plugins/sync.py b/syftbox/client/plugins/sync.py index 17056249..f4d09b0d 100644 --- a/syftbox/client/plugins/sync.py +++ b/syftbox/client/plugins/sync.py @@ -3,6 +3,7 @@ from collections import defaultdict from datetime import datetime from threading import Event +from typing import Tuple import requests from watchdog.events import DirModifiedEvent @@ -26,6 +27,49 @@ IGNORE_FOLDERS = [CLIENT_CHANGELOG_FOLDER, STAGING, CLIENT_APPS] +def get_ignore_rules(dir_state: DirState) -> Tuple[str, str, str]: + # get the ignore files + syft_ignore_files = [] + folder_path = dir_state.sync_folder + "/" + dir_state.sub_path + for afile, file_info in dir_state.tree.items(): + full_path = folder_path + "/" + afile + sub_folder = os.path.dirname(full_path) + + if afile.endswith(".syftignore") and os.path.isfile(full_path): + ignore_list = [] + with open(full_path) as f: + ignore_list = f.readlines() + for ignore_rule in ignore_list: + ignore_rule = ignore_rule.strip() + rule_prefix = sub_folder + "/" + ignore_rule + syft_ignore_files.append((rule_prefix, sub_folder, afile)) + + return syft_ignore_files + + +def filter_ignore_files(dir_state: DirState) -> DirState: + # get the ignore files + pruned_tree = dir_state.tree.copy() + folder_path = dir_state.sync_folder + "/" + dir_state.sub_path + syft_ignore_files = get_ignore_rules(dir_state) + + for rule_prefix, ignore_folder, ignore_file_path in syft_ignore_files: + for afile, file_info in dir_state.tree.items(): + full_path = folder_path + "/" + afile + if full_path.startswith(rule_prefix): + # print("> File ignored by .syftignore", afile, ignore_rule) + if afile in pruned_tree: + del pruned_tree[afile] + + now = datetime.now().timestamp() + return DirState( + tree=pruned_tree, + timestamp=now, + sync_folder=dir_state.sync_folder, + sub_path=dir_state.sub_path, + ) + + # Recursive function to add folder structure def add_to_folder_tree(leaf, parts): if not parts: @@ -360,6 +404,20 @@ def handle_empty_folders(client_config, datasite): return changes +def filter_changes_ignore(pre_filter_changes, syft_ignore_files): + filtered_changes = [] + for change in pre_filter_changes: + keep = True + for syft_ignore in syft_ignore_files: + if change.full_path.startswith(syft_ignore[0]): + keep = False + break + if keep: + filtered_changes.append(change) + + return filtered_changes + + def sync_up(client_config): # create a folder to store the change log change_log_folder = f"{client_config.sync_folder}/{CLIENT_CHANGELOG_FOLDER}" @@ -399,15 +457,25 @@ def sync_up(client_config): ) # get the new dir state - new_dir_state = hash_dir(client_config.sync_folder, datasite, IGNORE_FOLDERS) - changes = diff_dirstate(old_dir_state, new_dir_state) + unfiltered_new_dir_state = hash_dir( + client_config.sync_folder, datasite, IGNORE_FOLDERS + ) + + # ignore files + syft_ignore_files = get_ignore_rules(unfiltered_new_dir_state) + new_dir_state = filter_ignore_files(unfiltered_new_dir_state) + + pre_filter_changes = diff_dirstate(old_dir_state, new_dir_state) # Add handling for empty folders empty_folder_changes = handle_empty_folders(client_config, datasite) - changes.extend(empty_folder_changes) + pre_filter_changes.extend(empty_folder_changes) + + changes = filter_changes_ignore(pre_filter_changes, syft_ignore_files) if len(changes) == 0: continue + val, val_files, inval = filter_changes(client_config.email, changes, perm_tree) # send val changes @@ -466,17 +534,27 @@ def sync_down(client_config) -> int: # perm_tree = PermissionTree.from_path(datasite_path) # get the new dir state - new_dir_state = hash_dir(client_config.sync_folder, datasite, IGNORE_FOLDERS) + + unfiltered_new_dir_state = hash_dir( + client_config.sync_folder, datasite, IGNORE_FOLDERS + ) + syft_ignore_files = get_ignore_rules(unfiltered_new_dir_state) + + # ignore files + new_dir_state = filter_ignore_files(unfiltered_new_dir_state) + remote_dir_state = get_remote_state(client_config, datasite) if not remote_dir_state: # print(f"No remote state for dir: {datasite}") continue - changes = diff_dirstate(new_dir_state, remote_dir_state) + pre_filter_changes = diff_dirstate(new_dir_state, remote_dir_state) # Add handling for empty folders empty_folder_changes = handle_empty_folders(client_config, datasite) - changes.extend(empty_folder_changes) + pre_filter_changes.extend(empty_folder_changes) + + changes = filter_changes_ignore(pre_filter_changes, syft_ignore_files) if len(changes) == 0: continue @@ -527,11 +605,12 @@ def sync_down(client_config) -> int: synced_dir_state = prune_invalid_changes(new_dir_state, changed_files) # combine successfulc hanges qwith old dir state - combined_tree = new_dir_state.tree + # we use unfiltered so they keep being ignored but we could change these to another list? + combined_tree = unfiltered_new_dir_state.tree combined_tree.update(synced_dir_state.tree) synced_dir_state.tree = combined_tree - synced_dir_state = delete_files(new_dir_state, deleted_files) + synced_dir_state = delete_files(synced_dir_state, deleted_files) change_text = "" if len(changed_files): From 57dab3afc767d66b028c772e3407163259f114b0 Mon Sep 17 00:00:00 2001 From: Madhava Jay Date: Mon, 7 Oct 2024 15:03:59 +1000 Subject: [PATCH 2/2] Fixed linting issues --- default_apps/github_app_updater/run.sh | 20 +-- syftbox/lib/lib.py | 231 ++++++++++++------------- syftbox/server/server.py | 4 +- 3 files changed, 127 insertions(+), 128 deletions(-) diff --git a/default_apps/github_app_updater/run.sh b/default_apps/github_app_updater/run.sh index 549fa117..83859fd4 100755 --- a/default_apps/github_app_updater/run.sh +++ b/default_apps/github_app_updater/run.sh @@ -50,15 +50,15 @@ should_update() { repo_name="$1" update_frequency="$2" last_update=$(get_last_update "$repo_name") - + # If there's no last update time, it's time to update if [ -z "$last_update" ]; then return 0 fi - + current_time=$(date +%s) time_since_last_update=$((current_time - last_update)) - + # Check if enough time has passed since the last update [ "$time_since_last_update" -ge "$update_frequency" ] } @@ -69,27 +69,27 @@ while IFS=',' read -r repo_url update_frequency update_type || [ -n "$repo_url" repo_url=$(echo "$repo_url" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//') update_frequency=$(echo "$update_frequency" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//') update_type=$(echo "$update_type" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//') - + # Skip empty lines if [ -z "$repo_url" ]; then continue fi - + # Extract the repository name from the URL repo_name=$(echo "$repo_url" | sed -e 's/.*\///' -e 's/\.git$//') repo_path="../$repo_name" - + # Check if it's time to update this repository if ! should_update "$repo_name" "$update_frequency"; then echo "Skipping $repo_name, not time to update yet" continue fi - + echo "Processing repository: $repo_name" echo "Local path: $repo_path" echo "Update frequency: every $update_frequency seconds" echo "Update type: $update_type" - + # Check if the repository already exists if [ -d "$repo_path" ]; then if [ "$update_type" = "REPLACE" ]; then @@ -132,10 +132,10 @@ while IFS=',' read -r repo_url update_frequency update_type || [ -n "$repo_url" echo "Failed to clone $repo_name" fi fi - + # Update the last update time for this repository set_last_update "$repo_name" - + echo "-----------------------------------" done < "github_apps.csv" diff --git a/syftbox/lib/lib.py b/syftbox/lib/lib.py index 56256fae..cfa3cf5a 100644 --- a/syftbox/lib/lib.py +++ b/syftbox/lib/lib.py @@ -7,7 +7,6 @@ import re import threading import zlib -from collections.abc import Callable from dataclasses import dataclass from datetime import datetime from enum import Enum @@ -266,39 +265,39 @@ def newer(self) -> bool: return False def read(self) -> bytes: - if is_symlink(self.full_path): - # write a text file with a syftlink - data = convert_to_symlink(self.full_path).encode("utf-8") - return data - else: - with open(self.full_path, "rb") as f: - return f.read() + # if is_symlink(self.full_path): + # # write a text file with a syftlink + # data = convert_to_symlink(self.full_path).encode("utf-8") + # return data + # else: + with open(self.full_path, "rb") as f: + return f.read() def write(self, data: bytes) -> bool: # if its a non private syftlink turn it into a symlink - if data.startswith(b"syft://") and not self.full_path.endswith(".private"): - syft_link = SyftLink.from_url(data.decode("utf-8")) - abs_path = os.path.join( - os.path.abspath(self.sync_folder), syft_link.sync_path - ) - if not os.path.exists(abs_path): - raise Exception( - f"Cant make symlink because source doesnt exist {abs_path}" - ) - dir_path = os.path.dirname(self.full_path) - os.makedirs(dir_path, exist_ok=True) - if os.path.exists(self.full_path) and is_symlink(self.full_path): - os.unlink(self.full_path) - os.symlink(abs_path, self.full_path) - os.utime( - self.full_path, - (self.last_modified, self.last_modified), - follow_symlinks=False, - ) - - return True - else: - return self.write_to(data, self.full_path) + # if data.startswith(b"syft://") and not self.full_path.endswith(".private"): + # syft_link = SyftLink.from_url(data.decode("utf-8")) + # abs_path = os.path.join( + # os.path.abspath(self.sync_folder), syft_link.sync_path + # ) + # if not os.path.exists(abs_path): + # raise Exception( + # f"Cant make symlink because source doesnt exist {abs_path}" + # ) + # dir_path = os.path.dirname(self.full_path) + # os.makedirs(dir_path, exist_ok=True) + # if os.path.exists(self.full_path) and is_symlink(self.full_path): + # os.unlink(self.full_path) + # os.symlink(abs_path, self.full_path) + # os.utime( + # self.full_path, + # (self.last_modified, self.last_modified), + # follow_symlinks=False, + # ) + + # return True + # else: + return self.write_to(data, self.full_path) def delete(self) -> bool: try: @@ -343,16 +342,16 @@ def is_symlink(file_path) -> bool: return os.path.islink(file_path) -def symlink_to_syftlink(file_path): - return SyftLink.from_path(file_path) +# def symlink_to_syftlink(file_path): +# return SyftLink.from_path(file_path) -def convert_to_symlink(path): - if not is_symlink(path): - raise Exception(f"Cant convert a non symlink {path}") - abs_path = get_symlink(path) - syft_link = symlink_to_syftlink(abs_path) - return str(syft_link) +# def convert_to_symlink(path): +# if not is_symlink(path): +# raise Exception(f"Cant convert a non symlink {path}") +# abs_path = get_symlink(path) +# syft_link = symlink_to_syftlink(abs_path) +# return str(syft_link) def get_file_last_modified(file_path: str) -> float: @@ -360,13 +359,13 @@ def get_file_last_modified(file_path: str) -> float: def get_file_hash(file_path: str) -> str: - if is_symlink(file_path): - # return the hash of the syftlink instead - sym_link_string = convert_to_symlink(file_path) - return hashlib.md5(sym_link_string.encode("utf-8")).hexdigest() - else: - with open(file_path, "rb") as file: - return hashlib.md5(file.read()).hexdigest() + # if is_symlink(file_path): + # # return the hash of the syftlink instead + # sym_link_string = convert_to_symlink(file_path) + # return hashlib.md5(sym_link_string.encode("utf-8")).hexdigest() + # else: + with open(file_path, "rb") as file: + return hashlib.md5(file.read()).hexdigest() def ignore_dirs(directory: str, root: str, ignore_folders=None) -> bool: @@ -679,17 +678,17 @@ def datasite_path(self) -> Path: def manifest_path(self) -> Path: return os.path.join(self.datasite_path, "public/manifest/manifest.json") - @property - def manifest(self) -> DatasiteManifest: - datasite_manifest = None - try: - datasite_manifest = DatasiteManifest.load(self.manifest_path) - except Exception: - datasite_manifest = DatasiteManifest.create_manifest( - path=self.manifest_path, email=self.email - ) + # @property + # def manifest(self) -> DatasiteManifest: + # datasite_manifest = None + # try: + # datasite_manifest = DatasiteManifest.load(self.manifest_path) + # except Exception: + # datasite_manifest = DatasiteManifest.create_manifest( + # path=self.manifest_path, email=self.email + # ) - return datasite_manifest + # return datasite_manifest def get_datasites(self: str) -> list[str]: datasites = [] @@ -699,71 +698,71 @@ def get_datasites(self: str) -> list[str]: datasites.append(folder) return datasites - def get_all_manifests(self): - manifests = {} - for datasite in get_datasites(self.sync_folder): - datasite_path = Path(self.sync_folder + "/" + datasite) - datasite_manifest = DatasiteManifest.load_from_datasite(datasite_path) - if datasite_manifest: - manifests[datasite] = datasite_manifest - return manifests - - def get_datasets(self): - manifests = self.get_all_manifests() - datasets = [] - for datasite, manifest in manifests.items(): - for dataset_name, dataset_dict in manifest.datasets.items(): - try: - dataset = TabularDataset(**dataset_dict) - dataset.syft_link = SyftLink(**dataset_dict["syft_link"]) - dataset.readme_link = SyftLink(**dataset_dict["readme_link"]) - dataset.loader_link = SyftLink(**dataset_dict["loader_link"]) - dataset._client_config = self - datasets.append(dataset) - except Exception as e: - print(f"Bad dataset format. {datasite} {e}") - - return DatasetResults(datasets) - - def get_code(self): - manifests = self.get_all_manifests() - all_code = [] - for datasite, manifest in manifests.items(): - for func_name, code_dict in manifest.code.items(): - try: - code = Code(**code_dict) - code.syft_link = SyftLink(**code_dict["syft_link"]) - code.readme_link = SyftLink(**code_dict["readme_link"]) - code.requirements_link = SyftLink(**code_dict["requirements_link"]) - code._client_config = self - all_code.append(code) - except Exception as e: - print(f"Bad dataset format. {datasite} {e}") - - return CodeResults(all_code) - - def resolve_link(self, link: SyftLink | str) -> Path: - if isinstance(link, str): - link = SyftLink.from_url(link) - return Path(os.path.join(os.path.abspath(self.sync_folder), link.sync_path)) + # def get_all_manifests(self): + # manifests = {} + # for datasite in get_datasites(self.sync_folder): + # datasite_path = Path(self.sync_folder + "/" + datasite) + # datasite_manifest = DatasiteManifest.load_from_datasite(datasite_path) + # if datasite_manifest: + # manifests[datasite] = datasite_manifest + # return manifests + + # def get_datasets(self): + # manifests = self.get_all_manifests() + # datasets = [] + # for datasite, manifest in manifests.items(): + # for dataset_name, dataset_dict in manifest.datasets.items(): + # try: + # dataset = TabularDataset(**dataset_dict) + # dataset.syft_link = SyftLink(**dataset_dict["syft_link"]) + # dataset.readme_link = SyftLink(**dataset_dict["readme_link"]) + # dataset.loader_link = SyftLink(**dataset_dict["loader_link"]) + # dataset._client_config = self + # datasets.append(dataset) + # except Exception as e: + # print(f"Bad dataset format. {datasite} {e}") + + # return DatasetResults(datasets) + + # def get_code(self): + # manifests = self.get_all_manifests() + # all_code = [] + # for datasite, manifest in manifests.items(): + # for func_name, code_dict in manifest.code.items(): + # try: + # code = Code(**code_dict) + # code.syft_link = SyftLink(**code_dict["syft_link"]) + # code.readme_link = SyftLink(**code_dict["readme_link"]) + # code.requirements_link = SyftLink(**code_dict["requirements_link"]) + # code._client_config = self + # all_code.append(code) + # except Exception as e: + # print(f"Bad dataset format. {datasite} {e}") + + # return CodeResults(all_code) + + # def resolve_link(self, link: SyftLink | str) -> Path: + # if isinstance(link, str): + # link = SyftLink.from_url(link) + # return Path(os.path.join(os.path.abspath(self.sync_folder), link.sync_path)) def use(self): os.environ["SYFTBOX_CURRENT_CLIENT"] = self.config_path os.environ["SYFTBOX_SYNC_DIR"] = self.sync_folder print(f"> Setting Sync Dir to: {self.sync_folder}") - @classmethod - def create_manifest(cls, path: str, email: str): - # make a dir and set the permissions - manifest_dir = os.path.dirname(path) - os.makedirs(manifest_dir, exist_ok=True) + # @classmethod + # def create_manifest(cls, path: str, email: str): + # # make a dir and set the permissions + # manifest_dir = os.path.dirname(path) + # os.makedirs(manifest_dir, exist_ok=True) - public_read = SyftPermission.mine_with_public_read(email=email) - public_read.save(manifest_dir) + # public_read = SyftPermission.mine_with_public_read(email=email) + # public_read.save(manifest_dir) - datasite_manifest = DatasiteManifest(datasite=email, file_path=path) - datasite_manifest.save(path) - return datasite_manifest + # datasite_manifest = DatasiteManifest(datasite=email, file_path=path) + # datasite_manifest.save(path) + # return datasite_manifest def create_folder(self, path: str, permission: SyftPermission): os.makedirs(path, exist_ok=True) @@ -781,6 +780,6 @@ def create_public_folder(self, path: str): public_read.save(full_path) return Path(full_path) - def publish(self, item, overwrite: bool = False): - if isinstance(item, Callable): - syftbox_code(item).publish(self, overwrite=overwrite) + # def publish(self, item, overwrite: bool = False): + # if isinstance(item, Callable): + # syftbox_code(item).publish(self, overwrite=overwrite) diff --git a/syftbox/server/server.py b/syftbox/server/server.py index d5762389..604512f5 100644 --- a/syftbox/server/server.py +++ b/syftbox/server/server.py @@ -193,7 +193,7 @@ async def get_ascii_art(): @app.get("/wheel/{path:path}", response_class=HTMLResponse) -async def browse_datasite(request: Request, path: str): +async def get_wheel(request: Request, path: str): if path == "": # Check if path is empty (meaning "/datasites/") return RedirectResponse(url="/") @@ -222,7 +222,7 @@ def get_file_list(directory="."): @app.get("/datasites", response_class=HTMLResponse) -async def browse_datasites(request: Request): +async def list_datasites(request: Request): datasite_path = os.path.join(SNAPSHOT_FOLDER) files = get_file_list(datasite_path) template_path = current_dir / "templates" / "datasites.html"