[WIP] A request to remove wort dependencies in database workflow design #6

Status: Open · wants to merge 27 commits into main
Commits (27)
81d8b2d  updating to remove wort dependency (ccbaumler, Jun 14, 2024)
8fc64e3  removing duplicates (ccbaumler, Jun 14, 2024)
e3162a0  Add `--profile` settings and rough README (ccbaumler, Jun 16, 2024)
3979812  Random sample from 1 to 5 in allthebacteria workflow (ccbaumler, Jun 16, 2024)
2cc3d62  initial gtdb workflows (only tax right now) (ccbaumler, Jun 25, 2024)
a183f8f  general manifest handling and workflow streamlined with collection an… (ccbaumler, Jun 29, 2024)
45dfa7e  add a generator and consolidate functions in update_sourmash script (ccbaumler, Jul 2, 2024)
acb9e38  streamline update workflow and improve update_sourmash_dbs script (ccbaumler, Jul 2, 2024)
572c9a1  remove unused rule and improve script (ccbaumler, Jul 2, 2024)
57a81f0  adding report template (ccbaumler, Jul 10, 2024)
42fab7b  Adding current allthebacteria workflow and scripts (ccbaumler, Jul 18, 2024)
b067bb1  genbank workflow and scripts update (ccbaumler, Jul 18, 2024)
e2cb680  gtdb workflow and scripts update (ccbaumler, Jul 18, 2024)
76e6a69  add script to check for the missing sigs (ccbaumler, Jul 25, 2024)
1e57c82  add gtdb workflow report with gather_failed script (ccbaumler, Jul 30, 2024)
1074b64  add genbank workflow reporting and fixing failures (ccbaumler, Jul 30, 2024)
940a649  Add allthebacteria scripts and updates to workflow (ccbaumler, Aug 2, 2024)
d497f87  update failed script and workflow report (ccbaumler, Aug 2, 2024)
47cc397  lineage and ident naming fix (ccbaumler, Aug 14, 2024)
18d23b3  clean up tax script (ccbaumler, Aug 14, 2024)
f2ffbc7  compatible GTDB manifest and lineage with GCA/GCF format (ccbaumler, Sep 4, 2024)
4b4e8e3  Create readme for genbank update (ccbaumler, Sep 5, 2024)
eb7d03a  update readme for genbank update (ccbaumler, Sep 5, 2024)
f02cad5  Add the output files from the workflow (ccbaumler, Sep 7, 2024)
31ef9d7  Fixes lineage bug - creates old lineage and new lineage files (ccbaumler, Sep 9, 2024)
5f948c3  update atb for lineage and cleaner final repo (ccbaumler, Sep 10, 2024)
5fa453a  rm extra snakefile and config (ccbaumler, Sep 10, 2024)
37 changes: 37 additions & 0 deletions allthebacteria-workflow/README.md
@@ -0,0 +1,37 @@
A sourmash database for the [AllTheBacteria database](https://doi.org/10.1101/2024.03.08.584059)

The assembly files were all sketched to create a single sourmash database.
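The sketching itself is driven by the Snakemake workflow below, but as a rough, hypothetical illustration of the underlying operation (the parameter string and output name here are invented), each batch's `manysketch.csv` can be sketched with the branchwater plugin:

```
# illustrative only; the workflow generates one manysketch.csv per batch
sourmash scripts manysketch manysketch.csv -p dna,k=21,scaled=1000 -o batch-sigs.zip
```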

Example release:

- https://ftp.ebi.ac.uk/pub/databases/AllTheBacteria/Releases/0.2/
- https://github.com/AllTheBacteria/AllTheBacteria/releases/tag/v0.2

## Executing

Note: To run the 5-sample test, simply write `test` in the `output_directory:` section of the config file. This must be run inside an `srun` session (`--profile` does not work at this time).

Create a list of assembly file paths from the FTP server:
```
./scripts/ftp_link_list.py ftp.ebi.ac.uk pub/databases/AllTheBacteria/Releases/0.2/assembly -s ftp -p allthebacteria-r0.2-assembly-paths.txt
```

Run the snakemake workflow:
```
snakemake -s allthebacteria.smk --use-conda --rerun-incomplete -j 10
```
> A rough estimate of the resources required to complete this workflow:
> it was set to run for 3 days with 10 CPUs and 80 GB of memory.
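
Since `--profile` does not work here, one way to launch the workflow under SLURM is a direct `srun` allocation sized to the estimate above; this is a sketch, and the exact flags your cluster needs may differ:

```
srun --time=3-00:00:00 --cpus-per-task=10 --mem=80G --pty \
    snakemake -s allthebacteria.smk --use-conda --rerun-incomplete -j 10
```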

## Sanity check?

Check that the total number of zip files created matches the number expected:
```
find allthebacteria-r0.2-sigs/ -maxdepth 2 -type f -name "*.zip" -exec echo "{}" \; | wc -l
```
This returns the count of individual zip files that were created.
For release 0.2, there were `665`, which matches `cat allthebacteria-r0.2-assembly-paths.txt | wc -l`.

To find any `missing.csv` files that actually contain entries (useful for renaming), run `find ./ -type f -name 'missing.csv' | while read file; do if [ $(wc -l < "$file") -gt 1 ]; then echo "$file"; fi; done`.
7 changes: 7 additions & 0 deletions allthebacteria-workflow/envs/branchwater.yaml
@@ -0,0 +1,7 @@
name: branchwater-smk
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- sourmash_plugin_branchwater>0.9.5,<1
31 changes: 31 additions & 0 deletions allthebacteria-workflow/envs/quarto.yaml
@@ -0,0 +1,31 @@
name: quarto-smk
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- quarto
- pandoc
- python=3.10 # for python support
- nodejs # for JavaScript support
- r-base # for R support
- r-essentials # useful R packages
- jupyterlab # for using Jupyter notebooks
- ipykernel # for Jupyter kernel support

# Add any other dependencies you might need
- sourmash
- numpy
- pandas
- matplotlib
- scipy
- scikit-learn

# R packages installed via Conda
- r-reticulate # for python options in R code
- r-tidyverse
- r-shiny
- r-rmarkdown

# Add TeXLive for PDF output
- texlive-core
8 changes: 8 additions & 0 deletions allthebacteria-workflow/envs/sourmash.yaml
@@ -0,0 +1,8 @@
name: sourmash-smk
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- pandas>2,<3
- sourmash>4.8,<5
76 changes: 76 additions & 0 deletions allthebacteria-workflow/scripts/extract_check.sh
@@ -0,0 +1,76 @@
#!/bin/bash

usage() {
    echo "Usage: $0 -f DATA_FILE -o OUTPUT_PATH -d DIR_NAME -p DOWNLOAD_PATH -i DOWNLOAD_LINKS"
    exit 1
}

while getopts "f:o:d:p:i:" opt; do
    case $opt in
        f) DATA_FILE="$OPTARG" ;;
        o) OUTPUT_PATH="$OPTARG" ;;
        d) DIR_NAME="$OPTARG" ;;
        p) DOWNLOAD_PATH="$OPTARG" ;;
        i) DOWNLOAD_LINKS="$OPTARG" ;;
        *) usage ;;
    esac
done

if [ -z "$DATA_FILE" ] || [ -z "$OUTPUT_PATH" ] || [ -z "$DIR_NAME" ] || [ -z "$DOWNLOAD_PATH" ] || [ -z "$DOWNLOAD_LINKS" ]; then
    usage
fi

# Fetch the archive for this directory from the links file
download_file() {
    echo "Downloading the file..."
    LINK=$(grep "$DIR_NAME" "$DOWNLOAD_LINKS")
    echo "From '$LINK'..."
    wget -P "$DOWNLOAD_PATH" --continue --no-clobber --tries=3 --wait=1 -nv "$LINK"
}

# Stream the archive through pv for a progress bar; decompress with multi-threaded xz
extract_file() {
    echo "Extracting the file..."
    pv "$DATA_FILE" | tar --use-compress-program="xz -T0 -q" --skip-old-files -xf - -C "$OUTPUT_PATH"
}

if [ ! -f "$DATA_FILE" ]; then
    echo "Data file not found."
    download_file
else
    echo "Data file found."
fi

echo "Extracting..."
extract_file
TAR_EXIT_CODE=$?

# Check if the extraction failed due to an unexpected end of input
if [ $TAR_EXIT_CODE -ne 0 ]; then
    echo "Error encountered during extraction. Checking for specific error..."

    # Re-run the extraction to capture its error output (the first run streamed to the terminal)
    if grep -q "Unexpected end of input" <<< "$(extract_file 2>&1)"; then
        echo "Detected 'Unexpected end of input' error. Removing corrupted file and re-downloading..."

        rm "$DATA_FILE"
        rm -r "$OUTPUT_PATH"

        # Re-download the file
        download_file

        # Try to extract the file again
        echo "Extracting newly downloaded file..."
        extract_file
        TAR_EXIT_CODE=$?

        if [ $TAR_EXIT_CODE -ne 0 ]; then
            echo "Extraction failed again. Please check the file source or script for issues."
            exit 1
        else
            echo "File extracted successfully after re-download."
        fi
    else
        echo "Extraction failed due to a different error. Exiting."
        exit 1
    fi
else
    echo "File extracted successfully."
fi
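
A hypothetical invocation of this script, with archive and batch names invented for illustration (the links file is the path list generated by `ftp_link_list.py` in the README):

```
./scripts/extract_check.sh \
    -f data/batch_1.tar.xz \
    -o extracted/ \
    -d batch_1 \
    -p data/ \
    -i allthebacteria-r0.2-assembly-paths.txt
```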

167 changes: 167 additions & 0 deletions allthebacteria-workflow/scripts/extract_missing_files.py
@@ -0,0 +1,167 @@
#! /usr/bin/env python

import tarfile
from collections import defaultdict
import argparse
import os
import csv
import sys

def row_generator(filename):
    with open(filename, 'rt') as fp:
        print("Reading csv file content...")
        for i, line in enumerate(fp, start=1):
            if i % 1000 == 0:
                print(f"Processing csv file: Line {i}", end='\r', flush=True)

            line = line.strip().split(',')

            # Skip the header, record the index of the in_checkfile column,
            # and yield only rows where that column is true
            if i == 1:
                header = line
                in_checkfile_index = header.index('in_checkfile')
                continue

            if line[in_checkfile_index].lower() == 'true':
                yield line

def line_processor(missed_list):
    total = len(missed_list)
    increments = max(1, total // 20)  # report in ~5% increments; guard against totals under 20
    missed_dict = defaultdict(list)

    for i, line in enumerate(missed_list):
        if i % increments == 0:
            percentage = (i / total) * 100
            print(f'Processing missing files: {i}/{total} ({percentage:.2f}%)', end='\r', flush=True)
        tar_file = line[0]
        seq_file = line[1]
        missed_dict[tar_file].append(seq_file)

    print('\n')
    print(f'Processed missing files: {total}/{total} (100%)')
    return missed_dict

def load_info_dict(info_files=[]):
    info_dict = {}
    for info_file in info_files:
        print(f'Parsing {info_file}...')
        with open(info_file, 'r') as fp:
            reader = csv.DictReader(fp, delimiter='\t')
            for row in reader:
                info_dict[row['sample']] = row['sample'] + " " + row['Contig_name']
    return info_dict

def manysketch_file(seq_dir, info_dict):
    csv_file = os.path.join(seq_dir, 'manysketch.csv')
    missing_file = os.path.join(seq_dir, 'missing.csv')
    print(f"Writing 'manysketch.csv' and 'missing.csv' files for {seq_dir}")

    with open(csv_file, 'w', newline='') as csvfile, open(missing_file, 'w') as missing_fp:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['name', 'genome_filename', 'protein_filename'])

        filepaths = [os.path.join(seq_dir, f) for f in os.listdir(seq_dir)
                     if os.path.isfile(os.path.join(seq_dir, f)) and
                     f != os.path.basename(csv_file) and f != os.path.basename(missing_file)]
        num_files = len(filepaths)

        increment = max(1, int(num_files * 0.05))  # progress every ~5%; guard against small batches
        target_line = increment

        line_count = 0

        for filepath in filepaths:
            filename_ext = os.path.basename(filepath)
            filename = os.path.splitext(filename_ext)[0]

            if filename in info_dict:
                name = info_dict[filename].replace(',', '')

                csvwriter.writerow([name, filepath, ''])
            else:
                # Record files without a matching info entry, then sketch them under their bare filename
                if not filename.startswith('.'):
                    missing_fp.write(f"{filename},{filepath}\n")

                csvwriter.writerow([filename, filepath, ''])

            line_count += 1

            if line_count >= target_line:
                print(f"Progress: {line_count}/{num_files} lines written")
                target_line += increment

def main():
    p = argparse.ArgumentParser()

    p.add_argument('missed_files_csv', nargs='?', type=str, help='existing csv file created from find_missing_files.sh')
    p.add_argument('-d', '--data-dir', nargs='?', type=str, help='The directory containing the archive files (must be in "tar.xz" format)')
    p.add_argument('-o', '--output-dir', nargs='?', type=str, help='The name of a directory to store the seq files for processing')

    args = p.parse_args()

    missed_list = list(row_generator(args.missed_files_csv))

    # Preview the first few rows
    for line in missed_list[:6]:
        print(line)

    missed_dict = line_processor(missed_list)

    # Preview the first few archives and their missing members
    for i, (k, v) in enumerate(missed_dict.items()):
        if i >= 6:
            break
        print(f"Key {i+1}: {k}")
        if isinstance(v, (list, tuple)):
            for j, item in enumerate(v):
                if j >= 6:
                    break
                print(f"  Value {j+1}: {item}")
        else:
            print(f"  Value: {v}")

    total_keys = len(missed_dict)
    total_values = sum(len(v) for v in missed_dict.values())

    if os.path.exists(args.output_dir):
        print(f"Storing files in {args.output_dir}")
    else:
        print(f"Creating and storing files in {args.output_dir}")
        os.makedirs(args.output_dir, exist_ok=True)

    info_dict = load_info_dict([f"{os.path.dirname(args.missed_files_csv)}/sylph.tsv"])

    for i, (k, v) in enumerate(missed_dict.items()):
        print(f"Extracting sequence files from {k}: {i+1}/{total_keys}")
        tar_path = os.path.join(args.data_dir, k)
        dir_path = os.path.join(args.output_dir, os.path.dirname(v[0]))
        print(tar_path, dir_path)

        with tarfile.open(tar_path, 'r:xz') as tar:
            for j, seq_file in enumerate(v):
                print(f"Extracting {seq_file}: {j+1}/{total_values}", end='\r', flush=True)

                try:
                    member = tar.getmember(seq_file)
                    tar.extract(member, path=args.output_dir)
                except KeyError:
                    print(f"  {seq_file} not found in the archive {k}")

        print('\n')
        print(f'Extracted {len(v)} members from {k}')

        manysketch_file(dir_path, info_dict)


    print(f"Extracted files located in {args.output_dir}")

if __name__ == '__main__':
    sys.exit(main())
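
A hypothetical run of this script; the paths are invented for illustration, and note that it expects a `sylph.tsv` in the same directory as the missing-files CSV:

```
./scripts/extract_missing_files.py reports/missing_files.csv \
    -d data/ -o extracted-missing/
```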