Skip to content

Commit

Permalink
Merge pull request #40 from OpenMined/madhava/syftignore
Browse files Browse the repository at this point in the history
Added support for .syftignore files (simple prefix matching)
  • Loading branch information
madhavajay authored Oct 7, 2024
2 parents 62cc93e + 57dab3a commit 674e541
Show file tree
Hide file tree
Showing 6 changed files with 291 additions and 136 deletions.
32 changes: 32 additions & 0 deletions default_apps/adder/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import json
import os

from syftbox.lib import ClientConfig


def main():
    """Increment the ``datum`` counter for the adder app.

    Reads ``data.json`` from the app's inputs folder, bumps
    ``data["datum"]`` by 1, writes the result to the ``done`` folder,
    and removes the input file so each datum is processed exactly once.
    Prints a message and exits quietly when there is no input file.
    """
    # Config path is injected by the SyftBox runner via env var;
    # ClientConfig.load handles the None fallback itself.
    config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
    client_config = ClientConfig.load(config_path)

    base = f"{client_config.sync_folder}/{client_config.email}/app_pipelines/adder"
    input_folder = f"{base}/inputs/"
    output_folder = f"{base}/done/"
    os.makedirs(input_folder, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)

    input_file_path = f"{input_folder}data.json"
    output_file_path = f"{output_folder}data.json"

    if os.path.exists(input_file_path):
        with open(input_file_path, "r") as f:
            data = json.load(f)

        data["datum"] += 1

        with open(output_file_path, "w") as f:
            json.dump(data, f)

        # Remove the input so the same datum is not incremented twice.
        os.remove(input_file_path)
    else:
        print(f"Input file {input_file_path} does not exist.")


# Guarded entry point, consistent with the other default apps
# (e.g. logged_in/main.py); avoids side effects on import.
if __name__ == "__main__":
    main()
20 changes: 10 additions & 10 deletions default_apps/github_app_updater/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,15 +50,15 @@ should_update() {
repo_name="$1"
update_frequency="$2"
last_update=$(get_last_update "$repo_name")

# If there's no last update time, it's time to update
if [ -z "$last_update" ]; then
return 0
fi

current_time=$(date +%s)
time_since_last_update=$((current_time - last_update))

# Check if enough time has passed since the last update
[ "$time_since_last_update" -ge "$update_frequency" ]
}
Expand All @@ -69,27 +69,27 @@ while IFS=',' read -r repo_url update_frequency update_type || [ -n "$repo_url"
repo_url=$(echo "$repo_url" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
update_frequency=$(echo "$update_frequency" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')
update_type=$(echo "$update_type" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')

# Skip empty lines
if [ -z "$repo_url" ]; then
continue
fi

# Extract the repository name from the URL
repo_name=$(echo "$repo_url" | sed -e 's/.*\///' -e 's/\.git$//')
repo_path="../$repo_name"

# Check if it's time to update this repository
if ! should_update "$repo_name" "$update_frequency"; then
echo "Skipping $repo_name, not time to update yet"
continue
fi

echo "Processing repository: $repo_name"
echo "Local path: $repo_path"
echo "Update frequency: every $update_frequency seconds"
echo "Update type: $update_type"

# Check if the repository already exists
if [ -d "$repo_path" ]; then
if [ "$update_type" = "REPLACE" ]; then
Expand Down Expand Up @@ -132,10 +132,10 @@ while IFS=',' read -r repo_url update_frequency update_type || [ -n "$repo_url"
echo "Failed to clone $repo_name"
fi
fi

# Update the last update time for this repository
set_last_update "$repo_name"

echo "-----------------------------------"
done < "github_apps.csv"

Expand Down
45 changes: 45 additions & 0 deletions default_apps/logged_in/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import json
import os
from datetime import datetime

from syftbox.lib import ClientConfig


def main():
    """Record the current check-in time for this datasite.

    Writes ``last_check_in.json`` (current ISO-8601 timestamp) into the
    ``timestamp_recorder`` pipeline folder, plus a ``_.syftperm`` file
    granting global read while keeping admin/write with this client.
    """
    # The runner injects the config location; load() accepts None.
    config_path = os.environ.get("SYFTBOX_CLIENT_CONFIG_PATH", None)
    client_config = ClientConfig.load(config_path)

    # Ensure the pipeline output folder exists.
    output_folder = (
        f"{client_config.sync_folder}/{client_config.email}"
        "/app_pipelines/timestamp_recorder/"
    )
    os.makedirs(output_folder, exist_ok=True)

    # Capture the moment of this run as an ISO-8601 string.
    timestamp_data = {"last_check_in": datetime.now().isoformat()}

    output_file_path = f"{output_folder}last_check_in.json"
    with open(output_file_path, "w") as f:
        json.dump(timestamp_data, f, indent=2)

    # Permission file: world-readable, owned and writable by this client.
    syftperm_path = f"{output_folder}_.syftperm"
    syftperm_data = {
        "admin": [client_config.email],
        "read": ["GLOBAL"],
        "write": [client_config.email],
        "filepath": syftperm_path,
        "terminal": False,
    }
    with open(syftperm_path, "w") as f:
        json.dump(syftperm_data, f, indent=2)

    print(f"Timestamp has been written to {output_file_path}")
    print(f"_.syftperm file has been written to {syftperm_path}")


if __name__ == "__main__":
    main()
95 changes: 87 additions & 8 deletions syftbox/client/plugins/sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from collections import defaultdict
from datetime import datetime
from threading import Event
from typing import Tuple

import requests
from watchdog.events import DirModifiedEvent
Expand All @@ -26,6 +27,49 @@
IGNORE_FOLDERS = [CLIENT_CHANGELOG_FOLDER, STAGING, CLIENT_APPS]


def get_ignore_rules(dir_state: DirState) -> Tuple[str, str, str]:
# get the ignore files
syft_ignore_files = []
folder_path = dir_state.sync_folder + "/" + dir_state.sub_path
for afile, file_info in dir_state.tree.items():
full_path = folder_path + "/" + afile
sub_folder = os.path.dirname(full_path)

if afile.endswith(".syftignore") and os.path.isfile(full_path):
ignore_list = []
with open(full_path) as f:
ignore_list = f.readlines()
for ignore_rule in ignore_list:
ignore_rule = ignore_rule.strip()
rule_prefix = sub_folder + "/" + ignore_rule
syft_ignore_files.append((rule_prefix, sub_folder, afile))

return syft_ignore_files


def filter_ignore_files(dir_state: DirState) -> DirState:
    """Return a fresh DirState with .syftignore-matched entries removed.

    Applies every rule produced by ``get_ignore_rules`` as a simple
    absolute-path prefix match against each entry in the tree.
    """
    folder_path = dir_state.sync_folder + "/" + dir_state.sub_path
    ignore_rules = get_ignore_rules(dir_state)

    # Keep only entries whose absolute path matches no ignore prefix.
    pruned_tree = {
        afile: file_info
        for afile, file_info in dir_state.tree.items()
        if not any(
            (folder_path + "/" + afile).startswith(rule_prefix)
            for rule_prefix, _rule_dir, _ignore_file in ignore_rules
        )
    }

    return DirState(
        tree=pruned_tree,
        timestamp=datetime.now().timestamp(),
        sync_folder=dir_state.sync_folder,
        sub_path=dir_state.sub_path,
    )


# Recursive function to add folder structure
def add_to_folder_tree(leaf, parts):
if not parts:
Expand Down Expand Up @@ -360,6 +404,20 @@ def handle_empty_folders(client_config, datasite):
return changes


def filter_changes_ignore(pre_filter_changes, syft_ignore_files):
    """Drop changes whose full_path falls under any .syftignore prefix.

    ``syft_ignore_files`` holds (rule_prefix, rule_dir, ignore_file)
    tuples from ``get_ignore_rules``; only the prefix is matched here.
    """
    return [
        change
        for change in pre_filter_changes
        if not any(
            change.full_path.startswith(rule[0]) for rule in syft_ignore_files
        )
    ]


def sync_up(client_config):
# create a folder to store the change log
change_log_folder = f"{client_config.sync_folder}/{CLIENT_CHANGELOG_FOLDER}"
Expand Down Expand Up @@ -399,15 +457,25 @@ def sync_up(client_config):
)

# get the new dir state
new_dir_state = hash_dir(client_config.sync_folder, datasite, IGNORE_FOLDERS)
changes = diff_dirstate(old_dir_state, new_dir_state)
unfiltered_new_dir_state = hash_dir(
client_config.sync_folder, datasite, IGNORE_FOLDERS
)

# ignore files
syft_ignore_files = get_ignore_rules(unfiltered_new_dir_state)
new_dir_state = filter_ignore_files(unfiltered_new_dir_state)

pre_filter_changes = diff_dirstate(old_dir_state, new_dir_state)

# Add handling for empty folders
empty_folder_changes = handle_empty_folders(client_config, datasite)
changes.extend(empty_folder_changes)
pre_filter_changes.extend(empty_folder_changes)

changes = filter_changes_ignore(pre_filter_changes, syft_ignore_files)

if len(changes) == 0:
continue

val, val_files, inval = filter_changes(client_config.email, changes, perm_tree)

# send val changes
Expand Down Expand Up @@ -466,17 +534,27 @@ def sync_down(client_config) -> int:
# perm_tree = PermissionTree.from_path(datasite_path)

# get the new dir state
new_dir_state = hash_dir(client_config.sync_folder, datasite, IGNORE_FOLDERS)

unfiltered_new_dir_state = hash_dir(
client_config.sync_folder, datasite, IGNORE_FOLDERS
)
syft_ignore_files = get_ignore_rules(unfiltered_new_dir_state)

# ignore files
new_dir_state = filter_ignore_files(unfiltered_new_dir_state)

remote_dir_state = get_remote_state(client_config, datasite)
if not remote_dir_state:
# print(f"No remote state for dir: {datasite}")
continue

changes = diff_dirstate(new_dir_state, remote_dir_state)
pre_filter_changes = diff_dirstate(new_dir_state, remote_dir_state)

# Add handling for empty folders
empty_folder_changes = handle_empty_folders(client_config, datasite)
changes.extend(empty_folder_changes)
pre_filter_changes.extend(empty_folder_changes)

changes = filter_changes_ignore(pre_filter_changes, syft_ignore_files)

if len(changes) == 0:
continue
Expand Down Expand Up @@ -527,11 +605,12 @@ def sync_down(client_config) -> int:
synced_dir_state = prune_invalid_changes(new_dir_state, changed_files)

# combine successful changes with old dir state
combined_tree = new_dir_state.tree
# we use unfiltered so they keep being ignored but we could change these to another list?
combined_tree = unfiltered_new_dir_state.tree
combined_tree.update(synced_dir_state.tree)
synced_dir_state.tree = combined_tree

synced_dir_state = delete_files(new_dir_state, deleted_files)
synced_dir_state = delete_files(synced_dir_state, deleted_files)

change_text = ""
if len(changed_files):
Expand Down
Loading

0 comments on commit 674e541

Please sign in to comment.