spear

#!/usr/bin/env python3

import argparse
import snakemake 
import os
import sys 
import subprocess
from Bio import SeqIO
from pathlib import Path
import re
from shutil import rmtree
from summarise_snpeff import parse_vcf, write_vcf
from summary_report import summary_report
from rich.console import Console
from rich.table import Table
from rich.progress import track
from rich.text import Text
from rich import box
import datetime
import pandas as pd
import io 

def get_representative_mutations(anno_file, lineage, cutoff = 0.85, mutation_list_only = False):
    """Select the mutations carried by most samples of a single lineage.

    A mutation is keyed on REF + POS + ALT plus its ``residues`` value
    (missing residues are labelled "intergenic"). It is kept when the number
    of annotation rows carrying that key is at least ``cutoff`` times the
    number of distinct ``sample_id`` values found for the lineage.

    Args:
        anno_file: DataFrame providing at least the columns sample_id,
            lineage, REF, POS, ALT, description and residues. It is not
            modified; the lineage subset is copied before columns are added.
        lineage: Value of the ``lineage`` column to analyse. Also used as the
            name of the genotype column in the VCF-style table.
        cutoff: Fraction of the lineage's samples that must carry a mutation
            for it to be reported.
        mutation_list_only: When True, skip building the VCF-style table.

    Returns:
        A tuple ``(representative_mutations_df, representative_mutations_vcf)``.
        The first item holds REF, POS, ALT, description, residues, lineage,
        mutation_count and sample_count, sorted by POS in both modes. The
        second item is None when ``mutation_list_only`` is True; otherwise a
        DataFrame with the columns #CHROM, POS, ID, REF, ALT, QUAL, FILTER,
        INFO, FORMAT and one column named after ``lineage``, sorted by POS.
    """
    lineage_rows = anno_file.loc[anno_file.lineage == lineage].copy()
    lineage_rows["residues"] = lineage_rows["residues"].fillna("intergenic")
    lineage_rows["nt_aa_compound"] = (
        lineage_rows.REF
        + lineage_rows.POS.astype("int").astype("str")
        + lineage_rows.ALT
        + "_"
        + lineage_rows.residues
    )

    lineage_counts = lineage_rows.sample_id.nunique()
    # Count rows per mutation key on a single column so the result is always a Series.
    lineage_rows["mutation_count"] = lineage_rows.groupby("nt_aa_compound")["nt_aa_compound"].transform("size")

    output_columns = ["REF", "POS", "ALT", "description", "residues", "lineage", "mutation_count"]
    is_representative = lineage_rows.mutation_count >= (lineage_counts * cutoff)
    # Sort in both modes so the mutation table has the same row order
    # whether or not the VCF-style table is requested.
    representative_mutations_df = (
        lineage_rows.loc[is_representative, output_columns]
        .drop_duplicates()
        .sort_values(by = "POS")
    )
    representative_mutations_df["sample_count"] = lineage_counts

    if mutation_list_only:
        return representative_mutations_df, None

    # Drop duplicates where one genomic mutation alters multiple residues, e.g. 106/7/8 del.
    representative_mutations_vcf = representative_mutations_df[["REF", "POS", "ALT"]].drop_duplicates().copy()
    # Convert representative mutations into VCF column layout.
    representative_mutations_vcf["#CHROM"] = "NC_045512.2"
    representative_mutations_vcf["ID"] = (
        representative_mutations_vcf["REF"]
        + representative_mutations_vcf["POS"].astype("int").astype("str")
        + representative_mutations_vcf["ALT"]
    )
    representative_mutations_vcf["QUAL"] = "."
    representative_mutations_vcf["FILTER"] = "."
    representative_mutations_vcf["INFO"] = "AC=1;AN=1"
    representative_mutations_vcf["FORMAT"] = "GT"
    representative_mutations_vcf[lineage] = 1
    vcf_columns = ["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO", "FORMAT", lineage]
    # Rows are already in POS order, inherited from the sorted mutation table.
    representative_mutations_vcf = representative_mutations_vcf[vcf_columns]
    return representative_mutations_df, representative_mutations_vcf

def build_summary_table(input_samples, passing_samples, samples_fail_perc_n, samples_fail_count, cutoff, outfile, spear_version):
    """Build the sample QC summary table and write the same figures to a TSV file.

    Args:
        input_samples: Sized collection of all input samples (only its length is used).
        passing_samples: Sized collection of samples that passed QC (only its length is used).
        samples_fail_perc_n: Value shown for samples failing the %N cutoff; rendered with str().
        samples_fail_count: Value shown for incorrectly formatted samples; rendered with str().
        cutoff: %N cutoff shown in the table row label and written to the file.
        outfile: Path of the two-line, tab-separated file to write (header row, then values).
        spear_version: Version string written in the first column of the file.

    Returns:
        The rich Table with one row per QC category.
    """
    n_input = str(len(input_samples))
    n_passing = str(len(passing_samples))
    n_fail_perc_n = str(samples_fail_perc_n)
    n_fail_format = str(samples_fail_count)

    summary = Table(show_header=True, header_style="bold magenta", box = box.HORIZONTALS)
    for heading in ("Samples", "Count"):
        summary.add_column(heading)
    summary.add_row("Input samples", n_input)
    summary.add_row("[green]Passing Samples[/green]", Text(n_passing, "green"))
    summary.add_row(f'[red]%N > {str(cutoff)}[/red]', Text(n_fail_perc_n, "red"))
    summary.add_row("Incorrect format", n_fail_format)

    # The file mirrors the table as a header line followed by a single line of values.
    field_names = ("spear_version", "input_samples", "passing_samples", "samples_fail_perc_n", "samples_fail_count", "cutoff")
    field_values = (str(spear_version), n_input, n_passing, n_fail_perc_n, n_fail_format, str(cutoff))

    with open(outfile, "w") as handle:
        handle.write("\t".join(field_names) + "\n")
        handle.write("\t".join(field_values) + "\n")

    return summary

def main():

    SPEAR_PATH = os.environ.get('CONDA_PREFIX')
    
    SPEAR_VERSION = "2.1.0"

    parser = argparse.ArgumentParser('spear')   

    subparsers = parser.add_subparsers(dest='command')
    consensus = subparsers.add_parser('consensus', help='Run SPEAR on consensus FASTA sequence (align first).')
    alignment = subparsers.add_parser('alignment', help='Run SPEAR on alignment in FASTA format (skip alignment).')  
    vcf = subparsers.add_parser('vcf', help='Run SPEAR on existing VCF file(s) - skip alignment and SNP/indel identification and ambiguous SNP filtering.') 
    update = subparsers.add_parser('update', help='Update [spear,data,all]') 
    representative = subparsers.add_parser('utilities-representative', help='Obtain representative mutations for a given set of sequences (requires SPEAR annotation file)') 
    demo = subparsers.add_parser('demo', help='Run SPEAR demo on lineage VCFs')
    report = subparsers.add_parser("utilities-report", help = "Generate HTML report from SPEAR output (requires SPEAR annotation file)")
    vcf_merge = subparsers.add_parser('utilities-vcf-merge', help='Merge VCFs from different lineages into a single VCF')

    consensus.add_argument('--debug', default = True, action='store_false',
        help="Verbose snakemake execution")
    consensus.add_argument('--dag', default=False, action='store_true',
        help = "Display DAG and exit")
    consensus.add_argument('--no-report', default=False, action='store_true',
        help = "Do not produce HTML report")
    consensus.add_argument('--tmp', default=False, action='store_true',
        help = "Preserve intermediate output files for debugging.")  
    consensus.add_argument('--extension', metavar = '', type = str,
        help = "Suffix and extension for input files")    
    consensus.add_argument('--mask-problem-sites', metavar = 'AB AM HA', nargs='+', 
        help = "Filter problematic sides with these codes: [AB AM HA HH HO IC NA NS NL SS AD BR all]")     
    consensus.add_argument('--threads' , metavar='', type = int, default = 1,
        help = "Max number of threads for snakemake job execution.")
    consensus.add_argument('--aligner', metavar = '', default = "minimap2", type = str,
        help = "Alignment method to use for alignment to SARS-CoV-2 reference, 'minimap2' or 'muscle', default minimap2")  
    #consensus.add_argument('--allowAmbiguous', default=False, action='store_true',
    #    help = "Toggle whether to exclude ambiguous bases in SNPs and insertions")
    consensus.add_argument('--cutoff', metavar = '', type = int , default=30,
        help = "Percentage N cutoff for input sequences. Default 30")
    consensus.add_argument('--global_n', metavar = '', type = float,
        help = "Minimum percentage of N in sample to flag as poor coverage. Default half of cutoff.")
    consensus.add_argument('--s_n', metavar = '', type = float, default = 0.05,
        help = "Minimum percentage of N in S gene to flag as poor coverage. Default 5.")
    consensus.add_argument('--s_contig', metavar = '', type = float, default = 150,
        help = "Minimum length of contig to flag sample as potential S gene dropout. Default 150nt")
    consensus.add_argument('--rbd_n', metavar = '', type = float, default = 12,
        help = "Number of N's in sample spike RBD to flag as poor. Default 12nt")
    consensus.add_argument('--window' , metavar='', type = int, default = 2,
        help = "Maximum number of flanking N's around deletion, default 2")
    consensus.add_argument('--baseline_scores' , metavar='', type = str,
        help = "Custom baseline scores file for use in summary report")
    consensus.add_argument('--baseline' , metavar='', type = str, default = "BQ.1",
        help = "Baseline sample to use, either from pre-loaded baseline scores or user-supplied custom baseline file. Default BQ.1.")
    consensus.add_argument('--no-product-plot', default=False, action='store_true',
        help = "Do not produce individual sample product plots (for fastest operation)")  
    consensus.add_argument('--pangolin', default="accurate", type = str,
        help = "Pangolin operation mode: accurate (UShER), fast (pangolearn), none (don't run pangolin)")  
    consensus.add_argument('--per_sample_outputs', default = False, action= "store_true",
      help ='Specify whether to include updated VCFs and sample level tsv outputs - false = quicker, default = False') 
    consensus.add_argument('--input', required = True, metavar='input', type=str,
        help='Input FASTA file (may be gzip compressed)')
    consensus.add_argument('--output', required = True, metavar='output', type=str,
        help='Destination dir for SPEAR annotated VCFs')
    

    alignment.add_argument('--debug', default = True, action='store_false',
        help="Verbose snakemake execution")
    alignment.add_argument('--dag', default=False, action='store_true',
        help = "Display DAG and exit")
    alignment.add_argument('--no-report', default=False, action='store_true',
        help = "Do not produce HTML report")     
    alignment.add_argument('--tmp', default=False, action='store_true',
        help = "Preserve intermediate output files for debugging.")        
    alignment.add_argument('--extension', metavar = '', type = str,
        help = "Suffix and extension for input files")
    alignment.add_argument('--mask-problem-sites', metavar = 'AB AM HA', nargs='+', 
        help = "Filter problematic sites with these codes: [AB AM HA HH HO IC NA NS NL SS AD BR all]") 
    alignment.add_argument('--threads' , metavar='', type = int, default = 1,
        help = "Max number of threads for snakemake job execution.")  
    #alignment.add_argument('--allowAmbiguous', default=False, action='store_true',
    #    help = "Toggle whether to exclude ambiguous bases in SNPs and insertions")
    alignment.add_argument('--cutoff', metavar = '', type = int , default=30,
        help = "Percentage N cutoff for input sequences. Default 30")
    alignment.add_argument('--global_n', metavar = '', type = float,
        help = "Minimum percentage of N in sample to flag as poor coverage. Default half of cutoff.")
    alignment.add_argument('--s_n', metavar = '', type = float, default = 0.05,
        help = "Minimum percentage of N in S gene to flag as poor coverage. Default 5.")
    alignment.add_argument('--s_contig', metavar = '', type = float, default = 150,
        help = "Minimum length of contig to flag sample as potential S gene dropout. Default 150nt")
    alignment.add_argument('--rbd_n', metavar = '', type = float, default = 12,
        help = "Number of N's in sample spike RBD to flag as poor. Default 12nt")
    alignment.add_argument('--window' , metavar='', type = int, default = 2,
        help = "Maximum number of flanking N's around deletion, default 2")
    alignment.add_argument('--baseline_scores' , metavar='', type = str,
        help = "Custom baseline scores file for use in summary report")
    alignment.add_argument('--baseline' , metavar='', type = str, default = "BQ.1",
        help = "Baseline sample to use, either from pre-loaded baseline scores or user-supplied custom baseline file. Default BQ.1.")
    alignment.add_argument('--no-product-plot', default=False, action='store_true',
        help = "Do not produce individual sample product plots (for fastest operation)")  
    alignment.add_argument('--pangolin', default="accurate", type = str,
        help = "Pangolin operation mode: accurate (UShER), fast (pangolearn), none (don't run pangolin)")        
    alignment.add_argument('--input', required = True, metavar='input', type=str,
        help='Input directory alignment file, or directory of alignments.')
    alignment.add_argument('--output', required = True, metavar='output', type=str,
        help='Destination dir for SPEAR annotated VCFs')
    alignment.add_argument('--per_sample_outputs', default = False, action= "store_true",
        help ='Specify whether to include updated VCFs and sample level tsv outputs - false = quicker') 

    vcf.add_argument('--debug', default = True, action='store_false',
        help="Verbose snakemake execution")
    vcf.add_argument('--extension', metavar = '', type = str,
        help = "Suffix and extension for input files")
    vcf.add_argument('--tmp', default=False, action='store_true',
        help = "Preserve intermediate output files for debugging.")        
    vcf.add_argument('--dag', default=False, action='store_true',
        help = "Display DAG and exit")
    vcf.add_argument('--no-report', default=False, action='store_true',
        help = "Do not produce HTML report")         
    vcf.add_argument('--mask-problem-sites', metavar = 'AB AM HA', nargs='+', 
        help = "Filter problematic sides with these codes [AB AM HA HH HO IC NA NS NL SS AD BR all]") 
    vcf.add_argument('--threads' , metavar='', type = int, default = 1,
        help = "Max number of threads for snakemake job execution.")
    vcf.add_argument('--baseline_scores' , metavar='', type = str,
        help = "Custom baseline scores file for use in summary report")
    vcf.add_argument('--baseline' , metavar='', type = str, default = "BQ.1",
        help = "Baseline sample to use, either from pre-loaded baseline scores or user-supplied custom baseline file. Default BQ.1.")
    vcf.add_argument('--no-product-plot', default=False, action='store_true',
        help = "Do not produce individual sample product plots (for fastest operation)")
    vcf.add_argument('--pangolin', default="accurate", type = str,
        help = "Pangolin operation mode: accurate (UShER), fast (pangolearn), none (don't run pangolin)")  
    vcf.add_argument('--input', required = True, metavar='input', type=str,
        help='Input VCF file - can be a single or multisample VCF. Prepare VCF directories into single file using "spear vcf-merge"')
    vcf.add_argument('--output', required = True, metavar='output', type=str,
        help='Destination dir for SPEAR annotated VCFs')
    vcf.add_argument('--per_sample_outputs', default = False, action= "store_true",
        help ='Specify whether to include updated VCFs and sample level tsv outputs - false = quicker') 
    
    update.add_argument('option', type = str,
        help="Update option : spear, all-data or all")
    
    representative.add_argument('--anno_file', metavar = '', type = str, default = "spear_annotation_summary.tsv")
    representative.add_argument('--lineage_file', metavar = '', type = str, default = "lineage_report.csv", 
                                help = "Lineage file from pangolin lineage assignment or user-defined. Must contain column headers taxon (sample_id in SPEAR annotation) and lineage.")
    representative.add_argument('--lineage', metavar = '', default = None, type = str)
    representative.add_argument('--cutoff', metavar = '', type = float, default = 0.85)
    representative.add_argument('--mutation_list_only', default=False, action='store_true')
    representative.add_argument('--output_dir', metavar = '', type = str, default = "representative_mutations",
                                help= "Output file name for representative mutations")
    representative.add_argument('--output_suffix', metavar = '', type = str, default = "representative_mutations",
                                help= "Output file name suffix")

    
    report.add_argument('--no-product-plot', default=False, action='store_true',
        help = "Do not produce individual sample product plots (for fastest operation)")
    report.add_argument('--score_summary', metavar = '', type = str, default = "spear_score_summary.tsv",
        help = "SPEAR score summary file")
    report.add_argument('--annotation_summary', metavar = '', type = str, default = "spear_annotation_summary.tsv",
        help = "SPEAR annotation file")
    report.add_argument('--baseline_scores' , metavar='', type = str, default = f'{SPEAR_PATH}/data/baseline_scores.tsv',
        help = "Baseline scores file for use in summary report")
    report.add_argument('--pangolin_report', metavar = '', type = str, default = "lineage_report.csv",
        help= "Lineage report file from pangolin lineage assignment or user-defined. Must contain column headers taxon (sample_id in SPEAR annotation) and lineage. If pangolin not run, pass empty file")
    report.add_argument('--baseline', metavar = '', type = str, default = "BQ.1",
        help = "Baseline sample to use, either from pre-loaded baseline scores or user-supplied custom baseline file. Default BQ.1.")
    report.add_argument('--pangolin_command', metavar = '', type = str, default = "pangolin_command.txt",
        help = "File containing pangolin command used to generate lineage report")
    report.add_argument("--output_dir", metavar = '', type = str, default = "report",
        help = "Output directory for report files")
    report.add_argument("--spear_params", metavar = '', type = str, default = "spear_params.txt",
        help = "File containing spear params used to generate original annotations")
    report.add_argument("--spear_qc_info", metavar = '', type = str, default = "spear_qc_info.tsv",
        help = "File containing spear qc info used to generate original annotations")
    report.add_argument("--n_perc", metavar = '', type = str, default = "qc.csv",
        help = "File containing spear n_perc qc used to generate original annotations")
    
    vcf_merge.add_argument('--input', metavar = '', type = str,
        help = "Input directory of VCFs to merge")
    vcf_merge.add_argument('--output', metavar = '', type = str, default = "merged.vcf",
        help = "Output VCF file name, default 'merged.vcf'")
    vcf_merge.add_argument('--out_dir', metavar = '', type = str, default = ".",
        help = "Output directory, default current directory")
    
    args = parser.parse_args()
    start_time = datetime.datetime.now()

    console = Console()
    grid = Table.grid()
    grid.add_column(justify = "center")
    
    
    logo = [
        Text.assemble(("/\\", "bold")), 
        Text.assemble(("/  \\", "bold")),
        Text.assemble(("/ ", "bold"), ("/\\", "bold red"), (" \\", "bold")), 
        Text.assemble(("/_", "bold"), ("/  \\", "bold red"), ("_\\", "bold")), 
        Text.assemble(("/ ", "bold red"), ("/\ ", "bold"), ("\\", "bold red")), 
        Text.assemble(("/_", "bold red"), ("/  \\", "bold"), ("_\\", "bold red")), 
        Text.assemble(("/    \\", "bold")), 
        Text.assemble(("/_    _\\", "bold")),
        Text.assemble(("|", "bold"), ("|", "bold red"),("|", "bold red"), ("|", "bold")), 
        Text.assemble(("|", "bold"), ("|", "bold red"),("|", "bold red"), ("|", "bold")),  
        Text.assemble(("SPEAR: Systematic ProtEin AnnotatoR v2.1.0", "bold red")),
        Text.assemble(("Matt Crown, Matt Bashton 2021-2024", "bold red"))]
    
    for item in logo:
        grid.add_row(item)
    console.print(grid)

    if args.command == "demo":
        args.output = "demo_out"
        args.command = "vcf"
        args.mask_problem_sites = None
        args.no_report = None
        args.baseline_scores = None
        args.baseline = "BQ.1"
        args.no_product_plot = None
        args.debug = True
        args.tmp = False
        args.dag = None
        args.threads = 1
        SPEAR_PATH = os.environ.get('CONDA_PREFIX')
        args.input = f'{SPEAR_PATH}/data/combined_lineages_example.vcf'
        args.extension = ".vcf"
        vcf_in = True
        args.allowAmbiguous = False
        args.window = False
        allow_ambiguous = False
        args.global_n = 1.0
        args.s_n = 1.0
        args.s_contig = 29903
        args.rbd_n = 5000
        args.aligner = None
        args.pangolin = "none"
        args.per_sample_outputs = True

    if args.command == "update":
        subprocess.run(['sh','update_spear.sh',args.option])

    elif args.command == "utilities-representative":
        #add some intro text to describe what is happening, sample counts etc.
        Path(f'{args.output_dir}').mkdir(parents=True, exist_ok=True)
        anno_file = pd.read_csv(args.anno_file, sep = '\t')
        in_samples = anno_file.sample_id.nunique()
        console.print(f"[bold green]Loaded {in_samples} samples from annotation file[/bold green]")
        lineage_file = pd.read_csv(args.lineage_file)
        lineage_file = lineage_file[["taxon", "lineage"]].copy()
        anno_file = anno_file.merge(lineage_file, left_on = "sample_id", right_on = "taxon", how = "left")
        if anno_file.lineage.isna().any():
            console.print("[red]Warning : some samples do not have a lineage assignment. These will be excluded from representative mutation analysis.[/red]")
            anno_file = anno_file.loc[~anno_file.lineage.isna()].copy()
        console.print(f"[bold green]Analysing {anno_file.sample_id.nunique()} samples with lineage assignment[/bold green]")
        lineage_sample_counts = anno_file.groupby("lineage").sample_id.nunique()
        if any(lineage_sample_counts < 5):
            console.print("[red]Warning : some lineages have less than 5 samples. These will be excluded from representative mutation analysis.[/red]")
            anno_file = anno_file.loc[anno_file.lineage.isin(lineage_sample_counts.index[lineage_sample_counts >= 5])].copy()
        if len(anno_file) == 0:
            console.print("[red]Error : no samples found with lineage assignment. Exiting.[/red]")
            sys.exit(1)
        representative_info = []
        if args.lineage != None:
            mutations, mutations_vcf = get_representative_mutations(anno_file, args.lineage, cutoff = args.cutoff, mutation_list_only = args.mutation_list_only)
            mutations.to_csv(f'{args.output_dir}/{args.lineage}.{args.output_suffix}.tsv', sep = '\t', index = False)
            if not args.mutation_list_only:
                mutations_header = ["##fileformat=VCFv4.2", f"##source=SPEAR Representative", "##contig=<ID=NC_045512.2,length=29903,md5=837a7a2974dd01bd66ab6a4acf830b68,URL=https://github.com/m-crown/SPEAR/blob/main/data/reference.fasta>",
'##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">','##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">',
'##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">']
                write_vcf(mutations_header , mutations_vcf, f"{args.output_dir}/{args.lineage}.{args.output_suffix}.vcf")
                if len(mutations) != 0:
                    representative_info.append([args.lineage, str(mutations.sample_count.head(1).values[0]), str(len(mutations))])
                else:
                    representative_info.append([args.lineage, "no mutations identified", "not mutations identified"])
        else:
            for lineage in anno_file.lineage.unique():
                mutations,mutations_vcf = get_representative_mutations(anno_file, lineage, cutoff = args.cutoff, mutation_list_only = args.mutation_list_only)
                mutations.to_csv(f'{args.output_dir}/{lineage}_{args.output_suffix}.tsv', sep = '\t', index = False)
                if not args.mutation_list_only:
                    mutations_header = ["##fileformat=VCFv4.2", f"##source=SPEAR Representative", "##contig=<ID=NC_045512.2,length=29903,md5=837a7a2974dd01bd66ab6a4acf830b68,URL=https://github.com/m-crown/SPEAR/blob/main/data/reference.fasta>",
    '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">','##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count in genotypes, for each ALT allele, in the same order as listed">',
    '##INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">']
                    write_vcf(mutations_header , mutations_vcf, f"{args.output_dir}/{lineage}.{args.output_suffix}.vcf")
                if len(mutations) != 0:
                    representative_info.append([lineage, str(mutations.sample_count.head(1).values[0]), str(len(mutations))])
                else:
                    representative_info.append([lineage, "no mutations identified", "not mutations identified"])
        console.print("[bold green]Representative mutations generated[/bold green]")

        table = Table(show_header=True, header_style="bold magenta", title = "Representative Mutations Summary", caption = "Representative mutations identified in each lineage, with min {args.cutoff} lineage presence", caption_justify = "center")
        mutations = mutations.applymap(str)
        table.add_column("Lineage")
        table.add_column("# Samples")
        table.add_column("# Representative Mutations")
        for row in representative_info:
            table.add_row(*row)
        with open(f'{args.output_dir}/representative_mutations_summary.txt', 'w') as f:
            f.write("Lineage,samples,num_representative_mutations\n")
            for row in representative_info:
                f.write(f'{",".join(row)}\n')
        console.print(table)

    ##add vcf merge command - runs the find and bcftools.
    elif args.command == "utilities-vcf-merge":
        #add some intro text to describe what is happening, sample counts etc.
        console.print("[bold green]Merging VCFs[/bold green]")
        #run find command and send output to merge_list.txt
        #add some more qc steps here ?
        with open(f'{args.out_dir}/merge_list.txt', 'w') as f:
            subprocess.run(['find', f'{args.input}', '-name', '*.vcf'], stdout=f, check=True)
        #run bcftools merge command
        subprocess.run(['bcftools', 'merge', '--no-index', '-m', 'none', '-o', f'{args.out_dir}/{args.output}', '-l', f'{args.out_dir}/merge_list.txt'], check=True)
        

    elif args.command == "utilities-report":
        #add some intro text to describe what is happening, sample counts etc.
        console.print("[bold green]Generating HTML report[/bold green]")


        args.images_dir = f'{SPEAR_PATH}/images'
        args.scripts_dir = f'{SPEAR_PATH}/bin'
        args.data_dir = f'{SPEAR_PATH}/data'
        if not args.no_product_plot:
            args.product_plots = False
        else:
            args.product_plots = True
        
        Path(f'{args.output_dir}').mkdir(parents=True, exist_ok=True)

        summary_report(args)

    elif args.command not in ["alignment", "consensus", "vcf", "demo", "utilities-representative", "utilties-report", "utilities-vcf-merge"]:
        parser.print_help()
        console.print("Please select a subcommand (choose from 'consensus', 'alignment', 'vcf', 'utilities-representative', 'update', 'demo', 'utilities-report', 'utilities-vcf-merge')")
        sys.exit(1)

    else:
        if args.command == "consensus":
            align_in = True
        else:
            align_in = False
            args.aligner = None
        if args.command == "vcf":
            vcf_in = True
            args.allowAmbiguous = False
            args.window = False
            allow_ambiguous = False
            args.global_n = 1.0
            args.s_n = 1.0
            args.s_contig = 29903
            args.rbd_n = 5000
        else:
            vcf_in = False

        SPEAR_PATH = os.environ.get('CONDA_PREFIX')
        args.input = args.input.rstrip("/")
        args.output = args.output.rstrip("/")
        spear_qc_info = f'{args.output}/spear_qc_info.tsv'
        #check if input is a directory or a file and then operate differently depending
        if args.extension == None:
            if args.command == "consensus":
                extension = ".fa"
            elif args.command == "vcf":
                extension = ".vcf"
            else:
                extension = ".aln"
        else:
            extension = args.extension

        passing_samples = []
        Path(f'{args.output}/input_files').mkdir(parents=True, exist_ok=True)
        if args.command == "consensus":
            #if directory, fail with info 
            if os.path.isdir(args.input):
                console.print("[red]Error : input is a directory, please provide a single file.[\red]")
                sys.exit(1)
            #input should be a single file, regardless of number of samples. file can be gzipped or not.
            #verify input is a file not a dir
            nperc = subprocess.run(['seqkit', 'fx2tab', f'{args.input}', '-B', 'N', '--name'], stdout=subprocess.PIPE).stdout.decode('utf-8').rstrip('\n')
            input_samples = pd.read_csv(io.StringIO(nperc), sep='\t', names = ["sample_id", "perc_n"])
            samples_fail_perc_n = len(input_samples.loc[input_samples.perc_n >= args.cutoff])
            input_samples["sample_id_clean"] = input_samples["sample_id"].str.replace('[^a-zA-Z0-9.]', '_', regex = True)
            passing_samples = input_samples.loc[input_samples.perc_n < args.cutoff]
            
            passing_samples[["sample_id", "sample_id_clean"]].to_csv(f"{args.output}/input_files/pattern_file.tsv", sep = "\t", header = None, index = False)
            passing_samples[["sample_id"]].to_csv(f"{args.output}/input_files/name_pattern_file.tsv", sep = "\t", header = None, index = False)
            passing_samples = passing_samples.sample_id_clean.values.tolist()
            if len(passing_samples) == 1:
                    single_sample = True
            elif len(passing_samples) > 1:
                    single_sample = False
            else:
                console.print("[red]No samples to process, exiting.[\red]")
                sys.exit(1)
            grep_command = ['seqkit', 'grep', '--quiet', '-f', f'{args.output}/input_files/name_pattern_file.tsv', f'{args.input}']
            replace_command = [
                'seqkit', 'replace', '--quiet',
                '-p', '(.+)$',
                '-k', f'{args.output}/input_files/pattern_file.tsv',
                '-r', '{kv}',
                '-o', f'{args.output}/input_files/input.fasta.gz'
                ]
            
            grep_process = subprocess.Popen(grep_command, stdout=subprocess.PIPE)
            replace_process = subprocess.run(replace_command, stdin=grep_process.stdout, check=True)
            grep_process.stdout.close()
            grep_process.wait()
            samples_fail_count = 0 #only relevent when looking at non concat fa files - where ref may exist
            table = build_summary_table(input_samples, passing_samples, samples_fail_perc_n, samples_fail_count, args.cutoff, spear_qc_info, SPEAR_VERSION)

        elif args.command == "vcf":
            if os.path.isdir(args.input):
                console.print("[red]Error : input is a directory, please provide a single file, or use spear vcf-merge command to produce a single vcf from a vcf directory.[\red]")
                sys.exit(1)
            header , vcf = parse_vcf(f'{args.input}', split_info_cols = False)
            sample_names_old = vcf.columns[9:]
            if len(sample_names_old) == 0:
                parser.print_help()
                console.print("[red]Error : no samples found[\red]")
                sys.exit(1)
            elif len(sample_names_old) == 1:
                single_sample = True
            else:
                single_sample = False

            sample_names = sample_names_old.str.replace('[^a-zA-Z0-9]', '_')
            input_samples = sample_names.values.tolist()
            colnames = vcf.columns[0:9].values.tolist() + sample_names.values.tolist()
            vcf.columns = colnames
            if vcf["#CHROM"].str.contains("NC_045512\.2|MN908947\.3").all():
                Path(f'{args.output}/input_files').mkdir(parents=True, exist_ok=True)
                vcf["#CHROM"] = "NC_045512.2"
                write_vcf(header, vcf, f'{args.output}/input_files/input.vcf')
                passing_samples = sample_names.values.tolist()
            else:
                samples_fail_chrom_name = sample_names #however with combined file this is all
            if len(passing_samples) == 0:
                console.print("[red]No samples found with matching CHROM field, exiting.[\red]")
                sys.exit(1)      
            samples_fail_perc_n = "Not applicable, VCF input"
            samples_fail_count = "Not applicable, VCF input"
            table = build_summary_table(input_samples, passing_samples, samples_fail_perc_n, samples_fail_count, "NA", spear_qc_info, SPEAR_VERSION)
        elif args.command == "alignment":
            if os.path.isfile(args.input):
                single_sample = True
                passing_samples = []
                samples_fail_perc_n = 0
                samples_fail_count = 0
                samples_fail_no_ref = 0
                count = 0
                sample_file = list(SeqIO.parse(f'{args.input}', "fasta"))
                found_ref = False
                ref_regex = re.compile(r'NC_045512\.2|MN908947\.3')
                for record in sample_file:
                    count +=1
                    if ref_regex.search(record.id):
                        record.id = "NC_045512.2"
                        found_ref = True
                        continue
                    elif len(record.seq) == 0:
                        perc_n = 100
                    else:
                        perc_n = (record.seq.count("N")/len(record.seq)) * 100
                if count != 2:
                    samples_fail_count += 1
                elif perc_n >= float(args.cutoff):
                    samples_fail_perc_n += 1
                elif not found_ref:
                    samples_fail_no_ref += 1
                else:
                    sample_file[1].id = re.sub('[^a-zA-Z0-9\.]', '_', sample_file[1].id)
                    sample_file[1].name = re.sub('[^a-zA-Z0-9\.]', '_', sample_file[1].name)
                    sample_file[1].description = re.sub('[^a-zA-Z0-9\.]', '_', sample_file[1].description)
                    with open(f'{args.output}/input_files/{sample_file[1].id}{extension}', "w") as output_handle:
                        SeqIO.write(sample_file, output_handle, "fasta")
                    passing_samples = [sample_file[1].id]
            else:
                single_sample = False
                input_samples = [(f.split(f'{extension}'))[0] for f in os.listdir(args.input) if f.endswith(extension)]
                passing_samples = []
                samples_fail_perc_n = 0
                samples_fail_count = 0
                samples_fail_no_ref = 0
                for sample in input_samples:
                    count = 0
                    sample_file = list(SeqIO.parse(f'{args.input}/{sample}{extension}', "fasta"))
                    found_ref = False
                    ref_regex = re.compile(r'NC_045512\.2|MN908947\.3')
                    for record in sample_file:
                        count +=1
                        if ref_regex.search(record.id):
                            record.id = "NC_045512.2"
                            found_ref = True
                            continue
                        elif len(record.seq) == 0:
                            perc_n = 100
                        else:
                            perc_n = (record.seq.count("N")/len(record.seq)) * 100
                    if count != 2:
                        samples_fail_count += 1
                    elif perc_n >= args.cutoff:
                        samples_fail_perc_n += 1
                    elif not found_ref:
                        samples_fail_no_ref += 1
                    else:
                        sample_file[1].id = re.sub('[^a-zA-Z0-9\.]', '_', sample_file[1].id)
                        sample_file[1].name = re.sub('[^a-zA-Z0-9\.]', '_', sample_file[1].name)
                        sample_file[1].description = re.sub('[^a-zA-Z0-9\.]', '_', sample_file[1].description)
                        with open(f'{args.output}/input_files/{sample_file[1].id}{extension}', "w") as output_handle:
                            SeqIO.write(sample_file, output_handle, "fasta")
                        passing_samples.append(sample_file[1].id)
            table = build_summary_table(input_samples, passing_samples, samples_fail_perc_n, samples_fail_count, args.cutoff, spear_qc_info, SPEAR_VERSION)
            if len(passing_samples) == 1:
                single_sample = True
            elif len(passing_samples) == 0:
                console.print("[red]No samples to process, exiting.[/red]")
                sys.exit(1)
        else:
            parser.print_help()
            print("Error : invalid command")
            sys.exit(1)

        # if args.allowAmbiguous:
        #     exclude = ""
        #     allow_ambiguous = "--allowAmbiguous"
        # else:
        exclude = "-ambiguousToN"
        allow_ambiguous = ""
        problem_sites = args.mask_problem_sites

        problem_exc = { 
            "AB" : "ambiguous", 
            "AM" : "amended",
            "HA": "highly_ambiguous", 
            "HH" : "highly_homoplasic", 
            "HO" : "homoplasic", 
            "IC" : "interspecific_contamination", 
            "NA" : "nanopore_adapter", 
            "NS": "narrow_src", 
            "NL": "neighbour_linked", 
            "SS": "single_src", 
            "AD": "amplicon_drop_or_primer_artefact", 
            "BR": "back_to_ref"}

        exclusion_statements = []
        if problem_sites == None:
            filter_snps = False
            filter_statement = ""
        else:
            filter_snps = True
            if "all" in problem_sites:
                filter_statement = f"problem_filter = 'mask'"
            else:
                for site in problem_sites:
                    try:
                        problem_exc[site]
                        statement = f" problem_exc =~ '{problem_exc[site]}' "
                        exclusion_statements.append(statement)

                    except KeyError:
                        print("Masking parameter not recognised")
                        parser.print_help()
                        sys.exit(1)
                exclusions = "|".join(exclusion_statements)
                filter_statement = f"problem_filter = 'mask' & ({exclusions})"

        snakefile = f'{SPEAR_PATH}/bin/pipeline.smk'
        if single_sample: 
            singlesample = "True"
        else:
            singlesample = "False"

        if args.no_report == True:
            report = False
        else:
            report = True

        if args.baseline_scores:
            if os.path.isfile(args.baseline_scores):
                baseline_scores = args.baseline_scores
                baseline_df = pd.read_csv(baseline_scores, sep = '\t')
            else:
                console.print("Error, user specified baseline scores file not found. Exiting.")
                sys.exit(1)
        else:
            baseline_scores = f'{SPEAR_PATH}/data/baseline_scores.tsv'
            baseline_df = pd.read_csv(baseline_scores, sep = '\t')

        scores_cols = [
            "sample_id",	
            "total_variants",
            "total_residue_variants",
            "consequence_type_variants",
            "region_residues",
            "domain_residues",
            "feature_residues",
            "ACE2_contact_counts",
            "ACE2_contact_score",
            "trimer_contact_counts",
            "trimer_contact_score",
            "barnes_class_variants",
            "bloom_ACE2_wuhan_mean",
            "bloom_ACE2_wuhan_max",
            "bloom_ACE2_wuhan_min",
            "bloom_ACE2_BA1_mean",
            "bloom_ACE2_BA1_max",
            "bloom_ACE2_BA1_min",
            "bloom_ACE2_BA2_mean",
            "bloom_ACE2_BA2_max",
            "bloom_ACE2_BA2_min",
            "VDS_mean",
            "VDS_max",
            "VDS_min",
            "serum_escape_sum",
            "serum_escape_max",
            "serum_escape_min",
            "mAb_escape_all_classes_sum",
            "mAb_escape_all_classes_max",
            "mAb_escape_all_classes_min",
            "cm_mAb_escape_all_classes_sum",
            "cm_mAb_escape_all_classes_max",
            "cm_mAb_escape_all_classes_min",
            "mAb_escape_class_1_sum",
            "mAb_escape_class_1_max",
            "mAb_escape_class_1_min",
            "mAb_escape_class_2_sum",
            "mAb_escape_class_2_max",
            "mAb_escape_class_2_min",
            "mAb_escape_class_3_sum",
            "mAb_escape_class_3_max",
            "mAb_escape_class_3_min",
            "mAb_escape_class_4_sum",
            "mAb_escape_class_4_max",
            "mAb_escape_class_4_min",
            "BEC_EF_sample"]

        if baseline_df.columns.tolist() != scores_cols:
            console.print("Error, baseline scores file format incorrect, are you missing a field from spear_scores_summary.tsv ? Exiting.")
            sys.exit(1)

        if baseline_df["sample_id"].isin([args.baseline]).any():
            baseline = args.baseline
        else:
            console.print("Error, user specified baseline sample not found in baseline file. Exiting.")
            sys.exit(1)
        
        if single_sample:
            input_sample_num = 1
        else:
            input_sample_num = len(input_samples)
        qc_sample_num = len(passing_samples)

        if args.no_product_plot:
            product_plot = ""
        else:
            product_plot = "--product_plots"

        if args.command in ["consensus", "alignment"]:
            if not args.global_n:
                args.global_n = (args.cutoff/100) * 0.5
        
        if args.command == "vcf":
            cutoff = 0.3
        else:
            cutoff = args.cutoff/100

        if args.per_sample_outputs == True:
            per_sample_outputs = "True"
        else:
            if single_sample:
                per_sample_outputs = "True"
            else:
                per_sample_outputs = "False"


        spear_params = f'spear:{args.command},mask-problem-sites:{problem_sites},aligner:{args.aligner},cutoff:{cutoff},global_n:{args.global_n},s_n:{args.s_n},s_contig:{args.s_contig},rbd_n:{args.rbd_n},window:{args.window}'
        spear_params_file = f'{args.output}/spear_params.txt'
        with open(f'{spear_params_file}', 'w') as f:
            f.write(spear_params)
        
        #save the passing samples to a csv file for use in pipeline
        with open(f'{args.output}/passing_samples.csv', 'w') as f:
            for sample in passing_samples:
                f.write(f'{sample}\n')

        args.s_contig = int(args.s_contig)
        config = {
            "samples" : passing_samples, 
            "output_dir" : args.output,
            "data_dir" : f'{SPEAR_PATH}/data',
            "align" : align_in,
            "aligner" : args.aligner,
            "exclude_ambiguous" : exclude,
            "reference_sequence" : f'{SPEAR_PATH}/data/reference.fasta',
            "filter" : filter_snps,
            "filter_params" : filter_statement,
            "del_window" : args.window,
            "extension" : extension,
            "vcf" : vcf_in,
            "single_sample" : single_sample,
            "allow_ambiguous" : allow_ambiguous,
            "report": report,
            "images_dir" : f'{SPEAR_PATH}/images',
            "scripts_dir" : f'{SPEAR_PATH}/bin',
            "baseline" : baseline,
            "baseline_scores": baseline_scores,
            "input_sample_num" : input_sample_num,
            "qc_sample_num" : qc_sample_num,
            "product_plots" : product_plot,
            "spear_qc_info" : spear_qc_info,
            "pangolin" : args.pangolin,
            "threads" : args.threads,
            "max_n" : cutoff, 
            "spear_params" : spear_params_file,
            "per_sample_outputs" : per_sample_outputs,
            "single_sample" : singlesample}
        
        console.print(table)

        status = snakemake.snakemake(
                    snakefile, 
                    printshellcmds=False,
                    config=config, 
                    quiet= args.debug,
                    forceall = False, 
                    printdag = args.dag, 
                    cores = args.threads,
                    printreason = False)

        if status:
            end_time = datetime.datetime.now()
            c = end_time - start_time
            minutes = int(c.total_seconds() // 60)
            seconds = c.total_seconds() % 60
            
            console.print(f'Analysis complete! {str(len(passing_samples))} samples analysed in {str(minutes)} mins {str(round(seconds,2))} secs. :white_check_mark:')
            
            c = end_time - start_time

            if not args.tmp and not args.dag:
                rmtree(f'{args.output}/intermediate_output/')
                rmtree(f'{args.output}/input_files/')
            return 0
        return 1


if __name__ == "__main__":
    # main() returns 0 after a successful pipeline run and 1 when the
    # snakemake run reports failure; pass that value to sys.exit so the
    # calling shell or workflow sees the real outcome instead of always 0.
    sys.exit(main())