Skip to content

Commit

Permalink
filter snippy multi
Browse files Browse the repository at this point in the history
  • Loading branch information
ktmeaton committed Mar 27, 2020
1 parent d9388f3 commit 3c76889
Show file tree
Hide file tree
Showing 4 changed files with 146 additions and 8 deletions.
4 changes: 3 additions & 1 deletion nextflow.config
Original file line number Diff line number Diff line change
Expand Up @@ -76,9 +76,11 @@ params{
// Snippy summary files
snippy_variant_summary = "snippy_variant_summary"

// Snippy filterin
// Snippy filtering
snippy_snp_density_window = 10
snippy_variant_density = "snippy_variant_density"
snippy_multi_missing_data = 0.05
snippy_multi_missing_data_text = 5

// SQLite
sqlite_select_command = "\'SELECT AssemblyFTPGenbank FROM Master WHERE BioSampleComment NOT LIKE \"%REMOVE%\"\'"
Expand Down
56 changes: 49 additions & 7 deletions pipeline.nf
Original file line number Diff line number Diff line change
Expand Up @@ -677,24 +677,29 @@ if(!params.skip_snippy_multi){
/*
Input:
ch_():
ch_reference_genome_snippy_multi (gbff): The reference genome from process reference_download.
ch_bed_mask_snippy_multi (bed): Master masking BED file from process snippy_merge_mask_bed.
Output:
ch_ ():
ch_snippy_core_aln_filter (fasta): Multi fasta of aligned core SNPs for process snippy_multi_filter.
ch_snippy_core_full_aln_filter (fasta): Multi fasta of aligned core genome for process snippy_multi_filter.
Publish:
* (misc): All default output from snippy-core.
*/
// Other variables and config
tag ""
echo true
tag "${reference_genome_gb}"
publishDir "${params.outdir}/snippy_multi", mode: 'copy'

// IO and conditional behavior
input:
file reference_genome_gb from ch_reference_genome_snippy_multi
file bed_mask from ch_bed_mask_snippy_multi

output:
file "snippy-core.aln" into ch_snippy_core_aln_filter
file "snippy-core.full.aln" into ch_snippy_core_full_aln_filter
file "*"

// Shell script to execute
script:
Expand All @@ -708,15 +713,52 @@ if(!params.skip_snippy_multi){
# Perform multiple genome alignment (with custom filtering)
snippy-core \
--ref ${reference_genome_gb} \
--prefix raw \
--prefix snippy-core \
--mask ${bed_mask} \
--mask-char ${params.snippy_mask_char} \
\$allDir 2>&1 | tee snippy-raw.log
\$allDir 2>&1 | tee snippy-core.log
"""
}

}

process snippy_multi_filter{
/*
Input:
ch_snippy_core_full_aln_filter (fasta): Multi fasta of aligned core genome ffrom process snippy_multi.
Output:
ch_snippy_core_filter_modeltest (fasta): Multi fasta of filtered core genome sites for process modeltest.
Publish:
${snippy_core_full_aln.baseName}.filter${params.snippy_multi_missing_data_text}.fasta (fasta): Multi fasta of filtered core genome sites.
*/
// Other variables and config
tag "$snippy_core_full_aln"
publishDir "${params.outdir}/snippy_multi", mode: 'copy'

// IO and conditional behavior
input:
file snippy_core_full_aln from ch_snippy_core_full_aln_filter
output:
file "${snippy_core_full_aln.baseName}.filter${params.snippy_multi_missing_data_text}.fasta" into ch_snippy_core_filter_modeltest

// Shell script to execute
script:
"""
# Filter full genome alignment (No Missing Data)
snp-sites -m -c -b -o ${snippy_core_full_aln.baseName}.filtered.fasta ${snippy_core_full_aln};
# Filter full alignment (X% Missing Data)
${params.scriptdir}/fasta_unwrap.sh ${snippy_core_full_aln} > ${snippy_core_full_aln.baseName}.unwrap.fasta;
${params.scriptdir}/fasta_filterGapsNs.sh \
${snippy_core_full_aln.baseName}.unwrap.fasta \
${params.snippy_multi_missing_data} \
${snippy_core_full_aln.baseName}.filter${params.snippy_multi_missing_data_text}.backbone > ${snippy_core_full_aln.baseName}.filter${params.snippy_multi_missing_data_text}.fasta;
"""
}


// -------------------------------------------------------------------------- //
// Visualization MultiQC //
// -------------------------------------------------------------------------- //
Expand Down
72 changes: 72 additions & 0 deletions scripts/fasta_filterGapsNs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/bin/bash

# Assumes unwrapped fasta, no spaces between records

# File variables
INFILE=$1
MISSINGFRAC=$2
BACKBONEFILE=$3

# Remove duplicates using awk processing
awk -v missFrac="$MISSINGFRAC" -v backBoneFile="$BACKBONEFILE" '{
if(NR % 2 == 1)
{
header=$0
}
if(NR == 2)
{
for(i=1; i<= length($0); i++)
{
excludeArr[i] = 0;
}
}
if(NR % 2 == 0)
{
seqArr[header]=$0;
split($0, seq, "");
for (i=1; i <= length($0); i++)
{
if(seq[i] == "-" || toupper(seq[i]) == "N")
{
excludeArr[i]+= 1;
}
}
}
} END{
firstSeq= 1;
for (header in seqArr)
{
print header;
split(seqArr[header], seq, "");
finalSeq = "";
for (i=1; i <= length(seqArr[header]); i++)
{
if (excludeArr[i] / (NR / 2) <= missFrac)
{
finalSeq = finalSeq seq[i];
if(firstSeq == 1)
{
if (!beginBackBone)
{
beginBackBone=i;
endBackBone=i;
}
else if (i - 1 == endBackBone)
{
endBackBone=i;
}
else if (i - 1 > endBackBone)
{
print beginBackBone"-"endBackBone >> backBoneFile;
beginBackBone=i;
endBackBone=i;
}
}
}
}
if(firstSeq == 1){print beginBackBone"-"endBackBone >> backBoneFile;}
print finalSeq;
firstSeq = 0;
}
}' $INFILE

22 changes: 22 additions & 0 deletions scripts/fasta_unwrap.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash

# File variables
INFILE=$1

# Create bed format using awk processing
awk 'BEGIN{RS = ">"; FS = "\n"}
{
if(NR > 1)
{
fasta_seq = ""
for (i=2; i<=NF; i++)
{
fasta_seq = fasta_seq$i
}
print ">"$1"\n"fasta_seq;
}
}' $INFILE


0 comments on commit 3c76889

Please sign in to comment.