# config.yml
# (kept as a comment: a bare `config.yml` scalar ahead of the mapping below
# would make this document unparseable)

# things to update
# tc_path: TranscriptClean directory on the local machine. YAML does not
# expand the leading `~`; presumably the consumer does -- confirm.
tc_path: '~/mortazavi_lab/bin/TranscriptClean/'

# Path templates under data/sr_bulk/ (presumably short-read bulk data --
# confirm). `{...}` fields are placeholders filled in by whatever consumes
# this config.
sr:
  # Must stay quoted: a bare {sr_file} is parsed by YAML as the flow mapping
  # {sr_file: null}, not as the placeholder string.
  bam: '{sr_file}'
  bam_sorted: data/sr_bulk/star/{mouse_id}_sorted.bam
  bam_index: data/sr_bulk/star/{mouse_id}_sorted.bam.bai

  bw: data/sr_bulk/bigwig/{mouse_id}_{strand}.bw


# Reference resources: remote download URLs (*_link keys) and the local
# paths under ref/.
ref:
  # genome sequence
  fa_link: 'https://www.encodeproject.org/files/mm10_no_alt_analysis_set_ENCODE/@@download/mm10_no_alt_analysis_set_ENCODE.fasta.gz'
  fa_gz: 'ref/genome.fa.gz'
  fa: 'ref/genome.fa'
  chrom_sizes: 'ref/genome_chrom_sizes.tsv'

  # gene annotation and files under ref/ named after it
  gtf_link: 'https://www.encodeproject.org/files/gencode.vM21.primary_assembly.annotation_UCSC_names/@@download/gencode.vM21.primary_assembly.annotation_UCSC_names.gtf.gz'
  gtf_gz: 'ref/annot.gtf.gz'
  gtf: 'ref/annot.gtf'
  gtf_utr: 'ref/annot_utr.gtf'
  sjs: 'ref/annot_junc.bed'
  talon_db: 'ref/ref.db'

  # ca.h5 download and the ca_* files under ref/
  ca_link: 'https://www.encodeproject.org/files/ENCFF999KXH/@@download/ENCFF999KXH.h5'
  ca: 'ref/ca.h5'
  ca_ends: 'ref/ca_{end_mode}.bed'
  ca_ics: 'ref/ca_ic.tsv'
  ca_annot: 'ref/ca_annot.h5'

# Per-batch path templates; `{...}` fields are placeholders filled in by
# whatever consumes this config.
data:
  # raw reads
  fastq: 'data/{batch}/raw/{dataset}_{flowcell}.fastq'

  # mapping (minimap2 output directory)
  map_stats: 'data/{batch}/minimap2/{dataset}_{flowcell}_map_stats.txt'
  sam: 'data/{batch}/minimap2/{dataset}_{flowcell}.sam'
  bam: 'data/{batch}/minimap2/{dataset}_{flowcell}.bam'
  sam_log: 'data/{batch}/minimap2/{dataset}_{flowcell}_minimap.log'
  sam_rev: 'data/{batch}/minimap2/{dataset}_{flowcell}.rev.sam'

  # transcriptclean: files written under data/{batch}/tc/
  sam_clean: 'data/{batch}/tc/{dataset}_{flowcell}_clean.sam'
  fa_clean: 'data/{batch}/tc/{dataset}_{flowcell}_clean.fa'
  sam_clean_log: 'data/{batch}/tc/{dataset}_{flowcell}_clean.log'
  sam_clean_te_log: 'data/{batch}/tc/{dataset}_{flowcell}_clean.TE.log'
  tc_stats: 'data/{batch}/tc/{dataset}_{flowcell}_tc_stats.txt'


  # TODO -- put these in a separate talon_label --> study directory?
  # talon label: one file set per dataset and flowcell
  sam_label: 'data/{batch}/talon/{dataset}_{flowcell}_labeled.sam'
  bam_label_sorted: 'data/{batch}/talon/{dataset}_{flowcell}_labeled.sorted.bam'
  bam_label_sorted_index: 'data/{batch}/talon/{dataset}_{flowcell}_labeled.sorted.bam.bai'

  # merged labelled files: one file set per dataset (no flowcell field)
  bam_label_merge: 'data/{batch}/talon/{dataset}_labeled_merged.bam'
  bam_label_merge_sorted: 'data/{batch}/talon/{dataset}_labeled_merged_sorted.bam'
  bam_label_merge_index: 'data/{batch}/talon/{dataset}_labeled_merged_sorted.bam.bai'

  # bigwig output, one file per dataset and strand
  bw: 'data/{batch}/bigwig/{dataset}_{strand}.bw'

  # talon files keyed by both study and talon run
  talon_config: 'data/{batch}/talon/{study}/talon_{talon_run}_config.csv'
  talon_db: 'data/{batch}/talon/{study}/annot_{talon_run}_talon.db'
  read_annot: 'data/{batch}/talon/{study}/annot_{talon_run}_talon_read_annot.tsv'
  talon_temp: 'data/{batch}/talon/{study}/annot_{talon_run}_temp/'

  # talon files keyed by study only
  ab: 'data/{batch}/talon/{study}/annot_talon_abundance.tsv'
  filt_list: 'data/{batch}/talon/{study}/talon_pass_list.tsv'
  filt_ab: 'data/{batch}/talon/{study}/annot_talon_abundance_filtered.tsv'
  filt_gtf: 'data/{batch}/talon/{study}/annot_talon_observedOnly.gtf'
  full_read_annot: 'data/{batch}/talon/{study}/annot_talon_read_annot.tsv'

  # lapa: per-study files under data/{batch}/lapa/{study}/
  lapa_config: 'data/{batch}/lapa/{study}/lapa_config.csv'
  lapa_ends: 'data/{batch}/lapa/{study}/{end_mode}/{end_mode}_clusters.bed'
  lapa_links: 'data/{batch}/lapa/{study}/tss_to_tes_links.csv'
  lapa_gtf: 'data/{batch}/lapa/{study}/lapa_corrected.gtf'
  lapa_ab: 'data/{batch}/lapa/{study}/lapa_corrected_abundance.tsv'

  # lapa filtering: pass list and filtered files
  lapa_filt_list: 'data/{batch}/lapa/{study}/lapa_pass_list.tsv'
  lapa_filt_gtf: 'data/{batch}/lapa/{study}/lapa_corrected_filtered.gtf'
  lapa_filt_ab: 'data/{batch}/lapa/{study}/lapa_corrected_abundance_filtered.tsv'

  # cerberus: per-study files, then batch-level agg_* files
  ics: 'data/{batch}/cerberus/{study}/ics.tsv'
  ends: 'data/{batch}/cerberus/{study}/{end_mode}.bed'
  agg_ics: 'data/{batch}/cerberus/agg_ics.tsv'
  agg_ends: 'data/{batch}/cerberus/agg_{end_mode}.bed'
  ca_ref: 'data/{batch}/cerberus/ca_ref.h5'

  # cerberus annot
  ca_ref_annot: 'data/{batch}/cerberus/ca_vM21_annot.h5'
  # sequential
  ca_annot: 'data/{batch}/cerberus/ca_{study}_{cerb_run}_annot.h5'
  # parallel / non-overlapping
  ca_annot_2: 'data/{batch}/cerberus/{study}/ca_annot.h5'
  ca_ref_gtf: 'data/{batch}/cerberus/ca_vM21.gtf'
  ca_gtf: 'data/{batch}/cerberus/{study}/ca.gtf'
  ca_ab: 'data/{batch}/cerberus/{study}/ca_ab.tsv'

  # swan: per-study metadata and swan.p, plus batch-level comparison files
  swan_meta: 'data/{batch}/swan/{study}/swan_metadata.tsv'
  sg: 'data/{batch}/swan/{study}/swan.p'
  die_tsv: 'data/{batch}/swan/{genotype1}_{genotype2}_die.tsv'
  adata: 'data/{batch}/swan/swan_{feature}_adata.h5ad'

  # degs and dets: one table per genotype pair and feature
  de_tsv: 'data/{batch}/de/{genotype1}_{genotype2}_de_{feature}.tsv'

  # trouble region: per-dataset subset files under debug/
  bam_subset: 'data/{batch}/debug/{dataset}_subset.bam'
  bam_subset_sorted: 'data/{batch}/debug/{dataset}_subset_sorted.bam'
  bam_subset_index: 'data/{batch}/debug/{dataset}_subset_sorted.bam.bai'

  # fusions: one debug subdirectory per {fusion_gene1}_{fusion_gene2} pair
  bam_fusion_subset: 'data/{batch}/debug/{fusion_gene1}_{fusion_gene2}/{dataset}_{fusion_gene1}_{fusion_gene2}_subset.bam'
  bam_fusion_subset_sorted: 'data/{batch}/debug/{fusion_gene1}_{fusion_gene2}/{dataset}_{fusion_gene1}_{fusion_gene2}_subset_sorted.bam'
  bam_fusion_subset_index: 'data/{batch}/debug/{fusion_gene1}_{fusion_gene2}/{dataset}_{fusion_gene1}_{fusion_gene2}_subset_sorted.bam.bai'
  fusion_read_names: 'data/{batch}/debug/{fusion_gene1}_{fusion_gene2}/{dataset}_{fusion_gene1}_{fusion_gene2}_read_names.txt'
  all_fusion_read_names: 'data/{batch}/debug/{dataset}_fusion_read_names.txt'

  # bam minus fusion reads
  bam_minus_fusion: 'data/{batch}/talon/minus_fusion/{dataset}.bam'
  bam_minus_fusion_sorted: 'data/{batch}/talon/minus_fusion/{dataset}_sorted.bam'
  bam_minus_fusion_index: 'data/{batch}/talon/minus_fusion/{dataset}_sorted.bam.bai'


  # # trouble regions
  # bam_gfap_subset: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_subset.bam
  # bam_gfap_subset_sorted: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_subset_sorted.bam
  # bam_gfap_subset_index: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_subset_sorted.bam.bai
  #
  # bam_eftud2_gfap_subset: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_eftud2_subset.bam
  # bam_eftud2_gfap_subset_sorted: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_eftud2_subset_sorted.bam
  # bam_eftud2_gfap_subset_index: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_eftud2_subset_sorted.bam.bai
  #
  # eftud2_gfap_fusion_names: data/{batch}/debug/gfap_eftud2/{dataset}_gfap_eftud2_read_names.txt
  #
  #
  #
  # # trouble regions
  # bam_plp1_subset: data/{batch}/debug/plp1_bc/{dataset}_plp1_subset.bam
  # bam_plp1_subset_sorted: data/{batch}/debug/plp1_bc/{dataset}_plp1_subset_sorted.bam
  # bam_plp1_subset_index: data/{batch}/debug/plp1_bc/{dataset}_plp1_subset_sorted.bam.bai
  #
  # bam_bc_plp1_subset: data/{batch}/debug/plp1_bc/{dataset}_plp1_bc_subset.bam
  # bam_bc_plp1_subset_sorted: data/{batch}/debug/plp1_bc/{dataset}_plp1_bc_subset_sorted.bam
  # bam_bc_plp1_subset_index: data/{batch}/debug/plp1_bc/{dataset}_plp1_bc_subset_sorted.bam.bai
  #
  # bam_bc_plp1_fusion_names: data/{batch}/debug/plp1_bc/{dataset}_plp1_bc_read_names.txt