# ncbr_wgs_rapid_hg38_align.snakemake

#################################
#
# snakefile for converting CIDR Sequencing data deliveries to non-PII ready for GRIS upload, running QC, and joint genotyping
#
# Susan Huse, susan.huse@nih.gov
# Frederick National Lab
# April 10, 2019
#
# Justin Lack
# Frederick National Lab
# December 11, 2019
#
#################################

## 
## Load python modules
##
import os
from os import listdir
from os.path import join
import pandas as pd
import re
import sys
from glob import glob
import datetime

##
## Set initial global variables
##
# The workflow is run from inside the batch directory, so the current working
# directory doubles as the project root.
dir_renamed = os.getcwd()
dir_rawdata = os.path.join(dir_renamed, "rawdata")
dir_hla_master = "/sysapps/cluster/software/HLA-LA/1.0/HLA-LA-master"

# Everything up to and including the last occurrence of "batch" in the working
# directory path is stripped; whatever follows is taken as the batch number.
batch_number = re.compile("^.*batch").sub("", dir_renamed)
batch_name = "batch{}".format(batch_number)
#if int(batch_number) < 10:
#    batch_name0 = "BATCH0" + batch_number
#else:
#    batch_name0 = "BATCH" + batch_number

#dir_rawvcf = join(dir_rawdata, "MultiSampleVCF", "withGenotypeRefinement")
#VCF = [f for f in os.listdir(dir_rawvcf) if re.match(r'.*.vcf.gz$', f)][0]
#VCF = join(dir_rawvcf, VCF)
#fnames = ["cumulative_coverage_counts", "cumulative_coverage_proportions", "gene_summary", "interval_statistics", "interval_summary", "statistics", "summary"]

## Check if these are bams or crams
## NOTE: the automatic BAM/CRAM detection below is disabled (commented out).
## The only live statement in this section is the assignment of seqfile,
## so the sequence file extension is currently hard-coded to ".bam".
#if os.path.isdir(os.path.join(dir_rawdata, "CRAM")):
#    seqdir = "CRAM"
#    seqfile = ".cram"
#elif os.path.isdir(os.path.join(dir_rawdata, "BAM")):
#    seqdir = "BAM"
seqfile = ".bam"
#else:
#    print("Unable to locate input rawdata BAM or CRAM folder.  Quitting.")
#    sys.exit()


## Set variables for rerunning all of the old pedigrees
# Number of the batch preceding this one, kept as a string.
# int() raises ValueError when batch_number is not purely numeric.
previous_batch_number = int(batch_number) - 1
last_batch = str(previous_batch_number)
#dir_peds = "/hpcdata/dir/CIDR_DATA_RENAMED/pedigrees_updated"
#dir_peds = "/data/NCBR/projects/csi_test_batch/pedigrees_updated"
# Today's local date with the dashes removed (YYYYMMDD).
todays_date = datetime.date.today().isoformat().replace('-', '')

##
## Read in the masterkey file
##
# masterkey.txt is a tab-separated table with a header row, read from the
# working directory.  Its 'IDs' column supplies the sample IDs that the rules
# use as the {newID} wildcard, and its 'Names' column supplies the matching
# prefix of the raw FASTQ file names under rawdata/.
df = pd.read_csv("masterkey.txt", header=0, sep='\t')
#df = df.loc[(df['Batch_Received'].isin([batch_name0, ""])) | (df['Batch_Received'].isnull())]

# Mapping of sample ID -> raw file name prefix.  If an ID occurs more than
# once in the table, the last row wins.
dict_CIDR = dict(zip(df['IDs'].tolist(), df['Names'].tolist()))

# Report the mapping on stderr rather than stdout: Snakemake writes its
# machine-readable output (e.g. --dag, --rulegraph) to stdout, and anything
# printed there by the Snakefile would corrupt it.
print(dict_CIDR, file=sys.stderr)

# Reference paths and tool settings for the hg38 pipeline; exposed to the
# rules through the global `config` dictionary.
configfile:"NCBR_wgs_pipeline/ncbr_wgs_references_hg38.json"
# "chr"-prefixed contig names: 1 through 22 followed by X, Y and M.
chroms = ["chr{}".format(contig) for contig in list(range(1, 23)) + ["X", "Y", "M"]]
# Lane labels L001 through L004, as they appear in the raw FASTQ file names.
lanes = ["L{:03d}".format(lane_no) for lane_no in range(1, 5)]

##
## Set rule all
##
# Default target: request one final BAM for every sample ID found in the
# masterkey mapping.
rule all:
    input:
        recalbam = expand(
            "BAM/{newID}.final.bam",
            newID=list(dict_CIDR),
        ),

# Adapter and quality trimming of one lane of paired-end reads with
# Trimmomatic (PE mode).
#
# Inputs:  the R1/R2 FASTQ files of one lane under rawdata/.  The file name
#          prefix is looked up in dict_CIDR by sample ID; an unknown ID raises
#          KeyError when the input function is evaluated.
# Outputs: paired and unpaired trimmed reads for R1 and R2 (all temporary),
#          plus a file holding Trimmomatic's stderr.
#
# The lane is taken from the wildcards object explicitly instead of leaving a
# literal "{lanes}" placeholder in the string returned by the input function,
# so the path does not depend on Snakemake substituting wildcards in
# function-returned strings.
#
# The thread count is declared via `threads:` (as rule bwa_mem does) so the
# scheduler reserves the cores that the command actually uses.
rule trimmomatic:
    input:
        r1 = lambda w: [join("rawdata", dict_CIDR[w.newID] + "_" + w.lanes + "_R1_001.fastq.gz")],
        r2 = lambda w: [join("rawdata", dict_CIDR[w.newID] + "_" + w.lanes + "_R2_001.fastq.gz")],
    output:
        one = temp("fastqs/{newID}_{lanes}.R1.trimmed.fastq.gz"),
        two = temp("fastqs/{newID}_{lanes}.R1.trimmed.unpair.fastq.gz"),
        three = temp("fastqs/{newID}_{lanes}.R2.trimmed.fastq.gz"),
        four = temp("fastqs/{newID}_{lanes}.R2.trimmed.unpair.fastq.gz"),
        err = "fastqs/{newID}_{lanes}_run_trimmomatic.err"
    params:
        adapterfile = config['references']['trimmomatic.adapters'],
        rname = "pl:trimmomatic"
    threads: 24
    shell:
        """
        module load trimmomatic/0.39
        trimmomatic PE -threads {threads} -phred33 {input.r1} {input.r2} {output.one} {output.two} {output.three} {output.four} ILLUMINACLIP:{params.adapterfile}:3:30:10 LEADING:10 TRAILING:10 SLIDINGWINDOW:4:20 MINLEN:20 2> {output.err}
        """

# Align one lane of trimmed, paired reads with `bwa mem`, pipe the alignments
# through samblaster and then `samtools sort`, and write a temporary per-lane
# BAM.  The read group (ID, SM, LB and PU) is set to the sample ID.
rule bwa_mem:
    input:
        "fastqs/{newID}_{lanes}.R1.trimmed.fastq.gz",
        "fastqs/{newID}_{lanes}.R2.trimmed.fastq.gz"
    output:
        temp("BAM/{newID}_{lanes}.bam")
    params:
        genome = config['references']['GENOME'],
        rname = "pl:bwamem",
        sample = "{newID}"
    threads:
        24
    shell: """
            module load samtools/1.8
            module load samblaster/0.1.25
            module load bwa/0.7.17
            bwa mem -M -R \'@RG\\tID:{params.sample}\\tSM:{params.sample}\\tPL:illumina\\tLB:{params.sample}\\tPU:{params.sample}\\tCN:usuhs\\tDS:wgs\' -t {threads} {params.genome} {input} | /usr/local/apps/samblaster/0.1.25/bin/samblaster -M | samtools sort -@12 -m 4G - -o {output}
            """

# Build a temporary .bai index for a per-lane BAM with `samtools index`.
#
# The samtools module is pinned to the version loaded by rule bwa_mem so that
# the BAM is written and indexed by the same release, and the worker thread
# count is declared via `threads:` so the scheduler accounts for it.
rule index:
    input:
        bam = "BAM/{newID}_{lanes}.bam"
    output:
        bai = temp("BAM/{newID}_{lanes}.bai"),
    params:
        rname = "index"
    threads: 2
    shell:
        """
        module load samtools/1.8
        samtools index -@ {threads} {input.bam} {output.bai}
        """

# Base quality score recalibration, shard 1 of 5: run GATK BaseRecalibrator on
# the sample BAM restricted to chr1, chr2, chr14 and chrY and write a temporary
# recalibration table.
#
# Interval names carry the "chr" prefix to match the contig naming this hg38
# workflow uses (see the `chroms` list); bare names such as "1" would not be
# found in a "chr"-prefixed sequence dictionary.
# {params.knowns} is inserted verbatim -- presumably the known-sites arguments
# from the references config (TODO confirm).
# The .bai input is not passed on the command line; it is listed so that the
# index exists before GATK opens the BAM.
rule recal_1:
    input:
        bam = "BAM/{newID}.bam",
        bai = "BAM/{newID}.bai",
    output:
        re = temp("BAM/{newID}_1_recal_data.grp")
    params:
        genome = config['references']['GENOME'],
        knowns = config['references']['KNOWNRECAL'],
        rname = "recal1"
    shell:
        """
        module load GATK/4.1.6.0
        gatk --java-options '-Xmx48g' BaseRecalibrator --input {input.bam} --reference {params.genome} {params.knowns} --output {output.re} -L chr1 -L chr2 -L chr14 -L chrY
        """

# Base quality score recalibration, shard 2 of 5: run GATK BaseRecalibrator on
# the sample BAM restricted to chr3, chr4, chr5 and chr21 and write a temporary
# recalibration table.
#
# Interval names carry the "chr" prefix to match the contig naming this hg38
# workflow uses (see the `chroms` list); bare names such as "3" would not be
# found in a "chr"-prefixed sequence dictionary.
# {params.knowns} is inserted verbatim -- presumably the known-sites arguments
# from the references config (TODO confirm).
# The .bai input is not passed on the command line; it is listed so that the
# index exists before GATK opens the BAM.
rule recal_2:
    input:
        bam = "BAM/{newID}.bam",
        bai = "BAM/{newID}.bai",
    output:
        re = temp("BAM/{newID}_2_recal_data.grp")
    params:
        genome = config['references']['GENOME'],
        knowns = config['references']['KNOWNRECAL'],
        rname = "recal2"
    shell:
        """
        module load GATK/4.1.6.0
        gatk --java-options '-Xmx48g' BaseRecalibrator --input {input.bam} --reference {params.genome} {params.knowns} --output {output.re} -L chr3 -L chr4 -L chr5 -L chr21
        """

# Base quality score recalibration, shard 3 of 5: run GATK BaseRecalibrator on
# the sample BAM restricted to chr6, chr7, chr15, chr16 and chr18 and write a
# temporary recalibration table.
#
# Interval names carry the "chr" prefix to match the contig naming this hg38
# workflow uses (see the `chroms` list); bare names such as "6" would not be
# found in a "chr"-prefixed sequence dictionary.
# {params.knowns} is inserted verbatim -- presumably the known-sites arguments
# from the references config (TODO confirm).
# The .bai input is not passed on the command line; it is listed so that the
# index exists before GATK opens the BAM.
rule recal_3:
    input:
        bam = "BAM/{newID}.bam",
        bai = "BAM/{newID}.bai",
    output:
        re = temp("BAM/{newID}_3_recal_data.grp")
    params:
        genome = config['references']['GENOME'],
        knowns = config['references']['KNOWNRECAL'],
        rname = "recal3"
    shell:
        """
        module load GATK/4.1.6.0
        gatk --java-options '-Xmx48g' BaseRecalibrator --input {input.bam} --reference {params.genome} {params.knowns} --output {output.re} -L chr6 -L chr7 -L chr15 -L chr16 -L chr18
        """

# Base quality score recalibration, shard 4 of 5: run GATK BaseRecalibrator on
# the sample BAM restricted to chr8, chr9, chr10, chr13 and chr20 and write a
# temporary recalibration table.
#
# Interval names carry the "chr" prefix to match the contig naming this hg38
# workflow uses (see the `chroms` list); bare names such as "8" would not be
# found in a "chr"-prefixed sequence dictionary.
# {params.knowns} is inserted verbatim -- presumably the known-sites arguments
# from the references config (TODO confirm).
# The .bai input is not passed on the command line; it is listed so that the
# index exists before GATK opens the BAM.
rule recal_4:
    input:
        bam = "BAM/{newID}.bam",
        bai = "BAM/{newID}.bai",
    output:
        re = temp("BAM/{newID}_4_recal_data.grp")
    params:
        genome = config['references']['GENOME'],
        knowns = config['references']['KNOWNRECAL'],
        rname = "recal4"
    shell:
        """
        module load GATK/4.1.6.0
        gatk --java-options '-Xmx48g' BaseRecalibrator --input {input.bam} --reference {params.genome} {params.knowns} --output {output.re} -L chr8 -L chr9 -L chr10 -L chr13 -L chr20
        """

# Base quality score recalibration, shard 5 of 5: run GATK BaseRecalibrator on
# the sample BAM restricted to chr11, chr12, chr17, chr19, chr22 and chrX and
# write a temporary recalibration table.
#
# Interval names carry the "chr" prefix to match the contig naming this hg38
# workflow uses (see the `chroms` list); bare names such as "11" would not be
# found in a "chr"-prefixed sequence dictionary.
# {params.knowns} is inserted verbatim -- presumably the known-sites arguments
# from the references config (TODO confirm).
# The .bai input is not passed on the command line; it is listed so that the
# index exists before GATK opens the BAM.
rule recal_5:
    input:
        bam = "BAM/{newID}.bam",
        bai = "BAM/{newID}.bai",
    output:
        re = temp("BAM/{newID}_5_recal_data.grp")
    params:
        genome = config['references']['GENOME'],
        knowns = config['references']['KNOWNRECAL'],
        rname = "recal5"
    shell:
        """
        module load GATK/4.1.6.0
        gatk --java-options '-Xmx48g' BaseRecalibrator --input {input.bam} --reference {params.genome} {params.knowns} --output {output.re} -L chr11 -L chr12 -L chr17 -L chr19 -L chr22 -L chrX
        """

# Merge the five per-shard recalibration tables of one sample into a single
# table with GATK GatherBQSRReports.
#
# Inputs:  the five BAM/{newID}_<n>_recal_data.grp shard tables.
# Outputs: recal - the combined recalibration table;
#          list  - temporary text file naming the shard tables, one per line,
#                  handed to GATK through -I.
#
# The gatk command names its tool explicitly; without a tool name the gatk
# launcher has nothing to run.  The list file is written from the rule's
# declared inputs rather than a shell glob, so tables belonging to another
# sample whose ID merely starts with this sample's ID cannot be picked up, and
# it is written to {output.list} so the declared output and the file actually
# produced cannot drift apart.
# params.sample is no longer used by the command; it is kept so the rule's
# parameters stay as they were.
rule gather_bqsr:
    input:
        "BAM/{newID}_1_recal_data.grp",
        "BAM/{newID}_2_recal_data.grp",
        "BAM/{newID}_3_recal_data.grp",
        "BAM/{newID}_4_recal_data.grp",
        "BAM/{newID}_5_recal_data.grp"
    output:
        recal = "BAM/{newID}_recal_data.grp",
        list = temp("BAM/{newID}.recals.list")
    params:
        sample = "{newID}",
        rname = "gather_bqsr"
    shell:
        """
        module load GATK/4.1.6.0
        ls {input} > {output.list}
        gatk --java-options '-Xmx24g' GatherBQSRReports -I {output.list} --use-jdk-inflater --use-jdk-deflater -O {output.recal}
        """

# Apply the combined recalibration table to the sample BAM with GATK ApplyBQSR
# and write a temporary recalibrated BAM.
#
# Inputs:  bam - the sample BAM;
#          bai - its index (not passed on the command line; listed so that it
#                exists before GATK opens the BAM);
#          re  - the combined recalibration table.
# Output:  bam - the recalibrated BAM (temporary).
#
# The recalibration table is an input of this rule, so the command refers to
# it as {input.re}; the rule has no output named `re`.  temp() is not applied
# to the input because only output files can be marked temporary.
rule apply_recal:
    input:
        bam = "BAM/{newID}.bam",
        bai = "BAM/{newID}.bai",
        re = "BAM/{newID}_recal_data.grp"
    output:
        bam = temp("BAM/{newID}.recal.bam"),
    params:
        genome = config['references']['GENOME'],
        rname = "apply_recal"
    shell:
        """
        module load GATK/4.1.6.0
        gatk --java-options '-Xmx48g' ApplyBQSR --reference {params.genome} --input {input.bam} --bqsr-recal-file {input.re} --output {output.bam} --use-jdk-inflater --use-jdk-deflater
        """