-
Notifications
You must be signed in to change notification settings - Fork 0
/
kSNP4a
executable file
·1203 lines (901 loc) · 36.2 KB
/
kSNP4a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/bash
########################################################################
# WHERE ARE ALL THE kSNP SCRIPTS?
# IF YOU INSTALLED kSNP ANYWHERE OTHER THAN /user/local THEN YOU MUST MODIFY
# THIS TO POINT TO THE DIRECTORY WHERE YOU HAVE INSTALLED kSNP SCRIPTS
# Try to set this automatically from argv[0] (command path) - JN
# $0 references the command issued when invoking the script. The
# requirement that remains is that the other binaries continue to
# be co-located with this script, or be somewhere else in the PATH
export KSNPPATH=`dirname "${0}"`
########################################################################
#############
#############
#
# kSNP rewrite in Bash / Bourne shell starts here.
#
# We will slowly replace parts of this script's functionality with the same
# functionality in a replacement script written in Bash / Bourne. As we
# implement a feature in Bash, we will remove the corresponding lines in this
# script, until this script is empty except for a call to the kSNP_bash script.
# At that point, we will remove this script and rename the bash script.
#
#############
#############
#############
# Set up paths to required scripts. Permits us to change them in only one place.
#############
# Change this to the path to perfer or your analysis tool of choice if it is not in the $PATH
export PERFER=perfer
export ADD_PATHS="${KSNPPATH}/add_paths4"
export LE2UNIX="${KSNPPATH}/LE2Unix"
export CHECKFILENAMES="${KSNPPATH}/CheckFileNames"
export MERGE_FASTA_READ="${KSNPPATH}/merge_fasta_reads3"
export JELLYFISH="${KSNPPATH}/jellyfish"
export GETQUANTILE="${KSNPPATH}/get_quantile3"
export SUBSETMERS="${KSNPPATH}/subset_mers4.py"
export DELETEALLELECONFLICTS="${KSNPPATH}/delete_allele_conflicts3"
export PARALLELCOMMANDS="${KSNPPATH}/parallel_commands3"
export SUBSETMERLIST="${KSNPPATH}/subset_mer_list4.py"
export SNPSTOFASTAQUERY="${KSNPPATH}/SNPs2fastaQuery3"
export PICKSNPSFROMKMERS="${KSNPPATH}/pick_snps_from_kmer_genome_counts3"
export FINDALLELE="${KSNPPATH}/find_allele3"
export MUMMER="${KSNPPATH}/mummer"
export PARSEMUMMER="${KSNPPATH}/parse_mummer4kSNP3"
export NUMBERSNPS="${KSNPPATH}/number_SNPs_all4"
export RENAMEFROMTABLE="${KSNPPATH}/rename_from_table3"
export PARSESNPSTOVCF="${KSNPPATH}/parse_SNPs2VCF3"
export SNPSTOFASTAMATRIX="${KSNPPATH}/SNPs_all_2_fasta_matrix3" # NOTE: This is a .pl file and probably should be the name without a suffix. - JN
export PARSIMONATOR="${KSNPPATH}/parsimonator"
export CONSENSE="${KSNPPATH}/consense"
export FORCEBINARYTREE="${KSNPPATH}/force_binary_tree"
export FASTTREEMP="${KSNPPATH}/FastTreeMP"
export FINDCORESNPS="${KSNPPATH}/core_SNPs3"
export LABELTREENODES="${KSNPPATH}/label_tree_nodes3"
export TREENODES="${KSNPPATH}/tree_nodes3"
export SNPSTONODES="${KSNPPATH}/SNPs2nodes-new3"
export LABELTREEALLELECOUNT="${KSNPPATH}/labelTree_AlleleCount-new3"
export SNPMATRIXTODIST="${KSNPPATH}/SNP_matrix2dist_matrix3"
export DISTANCETREE="${KSNPPATH}/distance_tree3"
export FINDUNRESOLVEDCLUSTERS="${KSNPPATH}/find_unresolved_clusters3"
export GETGENBANKFILE="${KSNPPATH}/get_genbank_file4"
export ANNOTATESNPFROMGENBANK="${KSNPPATH}/annotate_SNPs_from_genbankFiles3"
export PARSEPROTEINANNOTATIONCOUNTS="${KSNPPATH}/parse_protein_annotation_counts3"
export PARANN="${KSNPPATH}/ParAnn"
export BUILDTREE="${KSNPPATH}/buildtreea"
export SUMMARIZEANNOTATION="${KSNPPATH}/summarize_annotation"
export GETFILTEREDKMERS="${KSNPPATH}/get_filtered_kmers"
export FREQCHECK="${KSNPPATH}/inline_frequency_check"
export PARTITIONKMERS="${KSNPPATH}/partitionKmers"
export FINDSNPS="${KSNPPATH}/find_snps"
export GUESSPARTITION="${KSNPPATH}/guessPartition"
# Other variables / files / directories.
export NAMEERRORSFILE="NameErrors.txt"
export THISDIR=`pwd`
export SPLITNAME="fsplit"
export FILE2GENOME="fileName2genomeName"
export HASHSIZE=1000000000 # One billion entries in jellyfish hash table.
export FINDALLELECMDS="cmds_find_allele"
export SNPSPREFIX="SNPs"
export CONFLICTSSUFFIX="conflictsDeleted"
export MERSSUFFIX="mers"
export SNPLOCISUFFIX="fasta"
export MUMMERCMDS="cmds_mummer"
export PARSEMUMMERCMDS="cmds_parse_mummer"
export MUMMEROUTPUT="mummer.out"
export SNPPOSITIONS="SNP.positions"
export COUNTSNPS="COUNT_SNPs"
export KMERSALLPREFIX="kmers_all"
export UNSORTEDKMERSPREFIX="unsortedkmers"
export NUMTREES=100
export PARSIMONYSEED=1234
export ANNOTATELIST="annotate_list"
export ANNOTATELISTHEADERS="headers."${ANNOTATELIST}""
export FASTALIST="fasta_list"
export MINIMUMK=9 # The minimum acceptable value of K
export CACHESIZE=10000000000 # Default cache size is 10G
export CACHEDIR="" # Default cache directory is null / no caching.
#############
# START Parse arguments
#############
# Technically our minimum # of arguments is 6... we can bail to error by testing
# this?
# Set default / initial values of each variable:
CMDLINE_ERROR=0 # No error initially
### Mandatory arguments
K=""
FASTALISTINPUT=""
DIR=""
### Optional arguments default value
MIN_FRACTION_WITH_LOCUS=""
ANNOTATELISTINPUT=""
SNPS_ALL_INPUT=""
GENBANKFILEINPUT=""
NUM_CPUS=0 # 0 means determine ourselves. Not accepted as user input
ALL_ANNOTATIONS=0
CORE=0
ML=0
NJ=0
VCF=0
DEBUG=0 # Permit user specifying whether debugging enabled on cmdline.
while [ ${#} -gt 0 ] # While we have more arguments to process...
do
case "${1}" in # "switch" / case based on what the argument is.
-k)
if [ -z "${K}" ] # if this is the first time k is being set...
then
K="${2}"
if [ "${MINIMUMK}" -gt "${K}" ] # Is K at least 3?
then
CMDLINE_ERROR=1
echo "Error: k must be at least ${MINIMUMK}"
elif [ 31 -lt "${K}" ] # For older versions of jellyfish, this is an issue...
then
echo "WARNING: k value greater than 31 detected. Be aware of two concerns:"
echo " 1) Older versions of the jellyfish program (before 2.x) do not support k > 31."
echo " 2) Larger values of k are more likely to obscure SNPs than lower values of k."
fi # How to test to determine k is a number?
else
# Otherwise, K was already set to something and the user tried to set it
# again, which is an error.
CMDLINE_ERROR=1
echo "Error: You may only specify -k once on the commandline."
fi
shift 2 # Argument and value
;;
-in)
if [ -z "${FASTALISTINPUT}" ] # If this is the first time this variable is being set...
then
FASTALISTINPUT=`"${ADD_PATHS}" "${2}" "${THISDIR}"`
if [ -e "${FASTALISTINPUT}" ] # Does FASTALISTINPUT file exist?
then
echo "fasta_list: ${FASTALISTINPUT}"
else
CMDLINE_ERROR=1
echo "Error: -in argument must be a file that exists (did not find ${FASTALISTINPUT})"
fi
else
# In this case, the input file was already set to something and the user tried
# to set it again, which is an error.
CMDLINE_ERROR=1
echo "Error: You may only specify -in once on the commandline."
fi
shift 2 # Argument and value
;;
-outdir)
if [ -z "${DIR}" ] # If this is the first time this variable is being set...
then
DIR=`"${ADD_PATHS}" "${2}" "${THISDIR}"`
echo "Output directory: ${DIR}"
# Where is this directory created? Do we create with -p?
else # The output directory was already set when we got here; this is an error.
CMDLINE_ERROR=1
echo "Error: You may only specify -outdir once on the commandline."
fi
shift 2 # Argument and value
;;
-min_frac)
MIN_FRACTION_WITH_LOCUS="${2}"
# TODO: Add validation?
shift 2 # Argument and value
;;
-annotate)
ANNOTATELISTINPUT=`"${ADD_PATHS}" "${2}" "${THISDIR}"`
if [ -e "${ANNOTATELISTINPUT}" ] # Does ANNOTATELISTINPUT file exist?
then
echo "annotate_list: ${ANNOTATELISTINPUT}"
else
CMDLINE_ERROR=1
echo "Error: -annotate argument must be a file that exists (did not find ${ANNOTATELISTINPUT})"
fi
shift 2 # Argument and value
;;
-SNPs_all)
SNPS_ALL_INPUT=`"${ADD_PATHS}" "${2}" "${THISDIR}"`
if [ -e "${SNPS_ALL_INPUT}" ] # Does SNPS_ALL_INPUT file exist?
then
echo "SNPs_all: ${SNPS_ALL_INPUT}"
else
CMDLINE_ERROR=1
echo "Error: -SNPs_all argument must be a file that exists (did not find ${SNPS_ALL_INPUT})"
fi
shift 2 # Argument and value
;;
-genbank)
GENBANKFILEINPUT=`"${ADD_PATHS}" "${2}" "${THISDIR}"`
if [ -e "${GENBANKFILEINPUT}" ] # Does GENBANKFILEINPUT file exist?
then
echo "genbankFile: ${GENBANKFILEINPUT}"
else
CMDLINE_ERROR=1
echo "Error: -genbank argument must be a file that exists (did not find ${GENBANKFILEINPUT})"
fi
shift 2 # Argument and value
;;
-CPU)
NUM_CPUS="${2}"
if [ 1 -gt "${NUM_CPUS}" ] # Did the user specify a silly number?
then
CMDLINE_ERROR=1
echo "Error: -CPU must be at least 1"
fi
shift 2 # Argument and value
;;
-core)
CORE=1
shift # Argument
;;
-ML)
ML=1
shift # Argument
;;
-NJ)
NJ=1
shift # Argument
;;
-vcf)
VCF=1
shift # Argument
;;
-cachedir)
if [ -z "${CACHEDIR}" ]
then
CACHEDIR=`"${ADD_PATHS}" "${2}" "${THISDIR}"`
else
CMDLINE_ERROR=1
echo "Error: -cachedir cannot be specified more than once."
fi
shift 2
;;
-nocache)
CACHEDIR=""
shift
;;
-debug)
DEBUG=1
# Same as specifying -x on the shell shebang above.
set -o xtrace
# Same as specifying -v on the shell shebang above.
set -o verbose
shift # Argument
;;
*) # All other values are errors.
CMDLINE_ERROR=1
echo "Error: Unrecognized argument (no valid argument: ${1})"
shift
;;
esac
done # Processing arguments
if [ -z "${K}" -o -z "${DIR}" -o -z "${FASTALISTINPUT}" ]
then
CMDLINE_ERROR=1
echo "Error: Must specify -k, -outdir, and -in. At least one is missing."
fi
if [ 0 -lt "${CMDLINE_ERROR}" ]
then
cat <<USAGE
Usage: kSNP3 -k <kmer length> -outdir <output directory> -in <input fasta file> [<optional arguments>...]
Required arguments:
-k <kmer_length>
-outdir <output_directory>
-in <input_fastaFile_list>
The input_fastaFile_list is a file listing the full path location of each genome
and the genome name, one line per genome, tab delimited between full path to
genome fasta file in column 1 and genome name in column 2. This format allows
multi-read,multi-chromosome, and multi-contig genomes, each genome in separate
fasta. If multiple chromosomes are listed as separate fasta entries in a single
genome file, positions and annotations are found for each gi number
Optional arguments:
-min_frac <minimum_fraction_genomes_with_locus> Create a parsimony tree based
only on SNP loci that occur in at least this fraction of
genomes, for example -min_frac 0.5
-annotate <annotate_list> File listing genome names for which to find positions
and annotate SNPs, names match column 2 of the -in file.
-core Calculate core SNPs and core SNP parsimony tree
-ML Calculate Maximum Likelihood tree
-CPU <num_CPU> Number of CPU's to use, (default to all)
-NJ Calculate a neighbor joining tree
-vcf Create a vcf file using the first genome specified in
the -positions file as the reference genome
USAGE
# Special case: If the user didn't specify -k, make a helpful suggestion to them.
if [ -z "${K}" ]
then
echo
echo "If you want a suggestion of an appropriate value for -k, you might try"
echo "running this command to analyze the input file and suggest a value:"
echo " Kchooser4 <inputfile>"
fi
exit 1
fi # if there was a parsing error.
if [ 1 -eq "${DEBUG}" ]
then
echo "Settings for this run:"
echo "K=${K}"
echo "FASTALISTINPUT=${FASTALISTINPUT}"
echo "DIR=${DIR}"
echo "MIN_FRACTION_WITH_LOCUS=${MIN_FRACTION_WITH_LOCUS}"
echo "ANNOTATELISTINPUT=${ANNOTATELISTINPUT}"
echo "SNPS_ALL_INPUT=${SNPS_ALL_INPUT}"
echo "GENBANKFILEINPUT=${GENBANKFILEINPUT}"
echo "NUM_CPUS=${NUM_CPUS}"
echo "ALL_ANNOTATIONS=${ALL_ANNOTATIONS}"
echo "CORE=${CORE}"
echo "ML=${ML}"
echo "NJ=${NJ}"
echo "VCF=${VCF}"
echo "DEBUG=${DEBUG}"
echo "CACHEDIR=${CACHEDIR}"
fi
#############
# FINISHED Parse arguments
#############
#############
# START Setup of environment, update user re: details.
#############
# Ensure our working directory is the directory specified in the commandline
# arguments, even if it doesn't exist yet (TODO - Ensure directory created in
# argument parsing above? Will be clearer that the dir will be created) - JN
if [ ! -d "${DIR}" ] # If the output directory doesn't exist...
then
mkdir -p "${DIR}" || { echo "Unable to create output / working directory: ${DIR}" ; exit 1 ; }
fi
cd "${DIR}"
# If a cache directory was specified and does not exist, create it
if [ -n "${CACHEDIR}" -a ! -d "${CACHEDIR}" ]
then
mkdir -p "${CACHEDIR}" || { echo "Unable to create cache directory ${CACHEDIR}, continuing without cache." ; CACHEDIR="" ; }
fi
# This validation should be to STDERR, and shouldn't use state? Or maybe the errors are
# complicated enough that we need to keep the details around for the user to use to address
# the errors? TODO FIXME XXX
"${PERFER}" "${CHECKFILENAMES}" "${FASTALISTINPUT}"
if [ -e "${NAMEERRORSFILE}" ]
then
echo "ERROR: kSNP terminated because error file ${NAMEERRORSFILE} is present."
echo " Please review the contents of this file, correct any errors found, remove the file, and re-run kSNP"
exit 1
fi
echo "Starting kSNP"
date
STARTSECONDS=`date +%s`
echo "Configuration for this run:"
echo "input fasta_list: ${FASTALISTINPUT}"
echo "output / working directory: ${DIR}"
echo "k=${K}"
echo "annotate_list file: ${ANNOTATELISTINPUT}"
if [ -n "${MIN_FRACTION_WITH_LOCUS}" ] # Defaults to empty string.
then
echo "min_fraction_with_locus: ${MIN_FRACTION_WITH_LOCUS}"
fi
if [ -n "${GENBANKFILEINPUT}" ] # Already checked for existence when parsing arguments.
then
echo "Genbank file for annotations (and any from NCBI with gi number which are automatically downloaded): ${GENBANKFILEINPUT}"
fi
MAXOPENFILES=`ulimit -H -n`
ulimit -n "${MAXOPENFILES}"
echo "Maximum number of open files: ${MAXOPENFILES}"
if [ 0 -eq "${NUM_CPUS}" ]
then
echo "Automatically determining number of CPUs to use:"
OS=`uname`
echo "The operating system is ${OS}"
if [ "Darwin" = "${OS}" ] # MacOS
then
NUM_CPUS=`/usr/sbin/system_profiler SPHardwareDataType | awk '/Total Number of Cores/ {print $5}'`
else # Linux
NUM_CPUS=`cat /proc/cpuinfo | grep processor | wc -l`
fi
echo "Discovered ${NUM_CPUS}"
if [ 1 -gt "${NUM_CPUS}" ]
then
NUM_CPUS=8
echo "Could not automatically determine number of CPUs available, defaulting to ${NUM_CPUS}"
fi
fi
echo "Number CPUs: ${NUM_CPUS}"
echo "SNPS_ALL_INPUT=${SNPS_ALL_INPUT}"
echo "GENBANKFILEINPUT=${GENBANKFILEINPUT}"
echo "CORE=${CORE}"
echo "ML=${ML}"
echo "NJ=${NJ}"
echo "VCF=${VCF}"
echo "DEBUG=${DEBUG}"
### Preprocess FASTA input file.
#chesk the fasta genome files to be sure line endings are Unix and fix if they are not
# Copy the original input file ${FASTALISTINPUT} to a specified name in the output directory for easier reference
cp -f "${FASTALISTINPUT}" "${FASTALIST}"
# Process each file listed in the input file to correct the line endings if needed and if possible
"${PERFER}" "${LE2UNIX}" "${FASTALIST}"
# Then process the "${FASTALIST}" file to correct its line-endings.
perl -i -pe 's/\015\012/\012/g' "${FASTALIST}" # Windows to unix
perl -i -pe 's/\015/\012/g' "${FASTALIST}" # Mac old format to unix
### Preprocess Annotation input file.
# WARNING:
# The file '"${ANNOTATELIST}"' in the working / output directory is maintained state.
# This means different behavior if the output directory is re-used.
# WARNING
# Benefits to operating on a copy? (does this program modify 'annotate_list'
# file?) - JN
if [ -e "${ANNOTATELISTINPUT}" ]
then
cp -f "${ANNOTATELISTINPUT}" "${ANNOTATELIST}"
else
touch "${ANNOTATELIST}"
fi
# Correct line endings in our copy of the annotations file.
#DOS to unix
perl -i -pe 's/\015\012/\012/g' "${ANNOTATELIST}" # Windows to unix
perl -i -pe 's/\015/\012/g' "${ANNOTATELIST}" # Mac old format to unix
# Output contents of annotate_list to the console (why?) - JN
echo "Finished genomes for finding SNP positions:"
cat "${ANNOTATELIST}"
echo ""
export FREQUENCYPREFIX='freq'
export KMERSPREFIX='kmers'
#############
#############
#
# BEGIN input file product caching work.
#
#############
#############
# Input:
# $FASTALIST
# $MERGEFASTAREAD
# $K
# $JELLYFISH
# $HASHSIZE
# $NUM_CPUS
# $CACHEDIR
# $CACHESIZE
#
# Output:
# $FILE2GENOME
# one file for each entry in $FILE2GENOME named kmers.<filename> (e.g. kmers.fsplit0)
#
"${PERFER}" "${GETFILTEREDKMERS}" "${FASTALIST}" "${MERGE_FASTA_READ}" "${K}" "${JELLYFISH}" "${HASHSIZE}" "${NUM_CPUS}" --cachedir "${CACHEDIR}" --maxcachebytes "${CACHESIZE}" "${FILE2GENOME}" "${FREQCHECK}" "${SPLITNAME}" kmers Jelly filtered --debug || exit 1
#############
#############
#
# END input file product caching work.
#
#############
#############
echo "Finished retrieving filtered kmers"
date
#############
#############
#
# BEGIN SNPs search for input into mummer
#
#############
#############
# If we start to hit memory limits, we will want to break this down further.
NUMPARTS="${NUM_CPUS}"
PARTITIONS='partitions.txt'
PARTITIONSUFFIX='.part'
SNPFASTAS='.SNPs.fasta'
date
"${PERFER}" "${GUESSPARTITION}" --input kmers.fsplit0 "${NUMPARTS}" > "${PARTITIONS}"
date
cut -f 1 "${FILE2GENOME}" |
sed -e "s/^/${KMERSPREFIX}./" |
xargs -t -n 1 -P "${NUM_CPUS}" "${PERFER}" "${PARTITIONKMERS}" --partition "${PARTITIONS}" --suffix "${PARTITIONSUFFIX}"
date
echo "Finished partitioning new kmer files."
SNPS_ALL_DATA="input_SNPs"
NUMGENOMES=`wc -l "${FILE2GENOME}" | awk '{print $1}'`
if [ -e "${SNPS_ALL_INPUT}" ] # If the SNPs_all argument points to a file that exists... (cached answers)
then
# This case represents the merging of SNPs between two runs. The union cannot include SNPs
# that are not present in one of the first or the second set of genomes, so it is likely lossy
echo "Using existing SNPs from ${SNPS_ALL_INPUT} file"
date
cp "${SNPS_ALL_INPUT}" "${SNPS_ALL_DATA}"
# Partition SNPs_all file:
# Partition the data using existing partitions.
"${PERFER}" "${PARTITIONKMERS}" --col 2 --partition "${PARTITIONS}" --suffix "${PARTITIONSUFFIX}" "${SNPS_ALL_DATA}"
# Previously created fasta files out of this... why?
date
let GENOMEPLUSSNP="${NUMGENOMES}"+1
# Generate the list of files to operate on using awk; We want to operate on all of the files
# for a given partition (e.g. input_SNPs.mers.part0 and kmers*.part0) together so that we can
# find the SNPs between genomes.
awk "BEGIN{
OFS=\"\" ;
kmer=\"${KMERSPREFIX}.${SPLITNAME}\" ;
part=\"${PARTITIONSUFFIX}\" ;
snps=\"${SNPS_ALL_DATA}\" ;
for (p = 0; p < ${NUMPARTS} ; p++) {
print snps,part,p
for (g = 0; g < ${NUMGENOMES} ; g++)
print kmer,g,part,p;
}
}" | xargs -t -n "${GENOMEPLUSSNP}" -P "${NUM_CPUS}" "${PERFER}" "${FINDSNPS}" --debug --fasta-suffix "${SNPFASTAS}" --snp-suffix ".${SNPSPREFIX}" --snps-all
date
echo "Found SNPs and merged with existing SNPs."
else
# This is the default case where we don't have a previous SNPS_ALL_INPUT file provided.
date
# Generate the list of files to operate on using awk; We want to operate on all of the files
# for a given partition (e.g. *.part0) together so that we can find the SNPs between genomes.
awk "BEGIN{
OFS=\"\" ;
kmer=\"${KMERSPREFIX}.${SPLITNAME}\" ;
part=\"${PARTITIONSUFFIX}\" ;
for (p = 0; p < ${NUMPARTS} ; p++) {
for (g = 0; g < ${NUMGENOMES} ; g++)
print kmer,g,part,p;
}
}" | xargs -t -n "${NUMGENOMES}" -P "${NUM_CPUS}" "${PERFER}" "${FINDSNPS}" --debug --fasta-suffix "${SNPFASTAS}" --snp-suffix ".${SNPSPREFIX}"
date
echo "Found SNPs"
fi
# Combine all of the parts for a given genome into a single .fasta file for input into mummer.
cat "${FILE2GENOME}" | while read FILE REST
do
KMERS="${KMERSPREFIX}.${FILE}"
# Note that these can be done in parallel, but I'd rather not manage the processes, and it should be fast.
cat "${KMERS}${PARTITIONSUFFIX}"*"${SNPFASTAS}" > "${KMERS}${SNPFASTAS}"
done
date
echo "Re-combined SNP fasta files for input into mummer"
date
# Combine all SNPs found into a single SNPs file.
#
# Note that our found SNPs have the term 'FILENAME' that needs to be replaced with the
# identifier for their particular source genome so that they can be associated correctly
# later.
cat "${FILE2GENOME}" | while read FILE REST
do
KMERS="${KMERSPREFIX}.${FILE}"
# Note that these can be done in parallel, but I'd rather not manage the processes, and it should be fast.
cat "${KMERS}${PARTITIONSUFFIX}"*"${SNPSPREFIX}" | sed -e "s/FILENAME/${FILE}/" >> "${KMERS}.${SNPSPREFIX}"
done
date
echo "Re-combined SNP fasta files for input into mummer"
date
#############
#############
#
# FINISHED SNPs search for input into mummer
#
#############
#############
# cat "${KMERS}"*"${SNPFASTAS}" > "${KMERS}${SNPFASTAS}"
date
if [ -s "${ANNOTATELIST}" ]
then
echo "Finding SNP positions in finished genomes using mummer."
date
[ -e "${MUMMERCMDS}" ] && rm "${MUMMERCMDS}"
[ -e "${PARSEMUMMERCMDS}" ] && rm "${PARSEMUMMERCMDS}"
touch "${MUMMERCMDS}" "${PARSEMUMMERCMDS}"
cat "${ANNOTATELIST}" | while read GENOME
do
TEST=`grep -w "${GENOME}" "${FILE2GENOME}" | wc -l`
FILE=`grep -w "${GENOME}" "${FILE2GENOME}" | awk '{print $1}'`
GENOMEDIRECTORY="${DIRPREFIX}.${FILE}"
if [ 0 -lt "${TEST}" ]
then
KMERS="${KMERSPREFIX}.${FILE}"
GENOMEFILE=`grep -w "${GENOME}" "${FASTALIST}" | awk -F'\011' '{print $1}'`
echo "genome: ${GENOME} in ${GENOMEDIRECTORY}"
# What does this do?? Assign filename to a meaningfully named variable rather than creating it twice.
# This turns a list of SNPs (e.g. the SNPs file per-genome) into a fasta-type file for input to mummer
SNPSINFASTAFORMAT="${GENOMEDIRECTORY}/${SNPSPREFIX}.${SNPLOCISUFFIX}"
# awk -F'\011' '{print ">" $1 "_" $2 "\n" $3 }' "${GENOMEDIRECTORY}/${SNPSPREFIX}" > "${SNPSINFASTAFORMAT}"
# These two commands output the SNP positions data.
# Mummer takes the per-SNP (loci + specific nucleotide) labeled list of interesting kmers, and finds where they are
# in the original genome(s). The output is loci with a . replacing the differing nucleotide, an underscore, and the nucleotide
# that was in the middle.
"${PERFER}" "${MUMMER}" -maxmatch -l "${K}" -b -c "${KMERS}${SNPFASTAS}" "${GENOMEFILE}" | "${PARSEMUMMER}" - > "${KMERS}.${SNPPOSITIONS}"
# echo "${MUMMER} -maxmatch -l ${K} -b -c ${SNPSINFASTAFORMAT} ${GENOMEFILE} > ${GENOMEDIRECTORY}/${MUMMEROUTPUT}" >> "${MUMMERCMDS}"
# "${PARSEMUMMER}" "${KMERS}${SNPFASTAs}.${MUMMEROUTPUT}" > "${KMERS}.${SNPPOSITIONS}"
# This is strictly a basic line-by-line parsing of mummer output; could / should be in a pipeline, as it is v. low
# complexity and would avoid writing the mummer output to disk before parsing.
# echo "${PARSEMUMMER} ${GENOMEDIRECTORY}/${MUMMEROUTPUT} > ${GENOMEDIRECTORY}/${SNPPOSITIONS}" >> "${PARSEMUMMERCMDS}"
else
echo "Not annotating ${GENOME} with mummer because it is not listed in annotations list: ${ANNOTATELISTINPUT}"
fi
done
# Mummer is capable of running in sequence on multiple "query" files (i.e. $GENOMEFILE). I suspect we aren't doing this
# so that we can parallelize the processes. Also, bucket size has an impact on mummer memory use.
# "${PARALLELCOMMANDS}" "${NUM_CPUS}" "${MUMMERCMDS}"
# "${PARALLELCOMMANDS}" "${NUM_CPUS}" "${PARSEMUMMERCMDS}"
date
echo "Finished finding SNP positions in finished genomes using mummer."
fi
export ALLSNPSUNSORTED="all_SNPs_unsorted"
export ALLSNPSSORTED="all_SNPs_sorted"
touch "${ALLSNPSUNSORTED}"
cat "${FILE2GENOME}" | while read FILE GENOME # For each genome input file...
do
KMERS="${KMERSPREFIX}.${FILE}"
GENOMEDIRECTORY="${DIRPREFIX}.${FILE}"
echo "genome: ${GENOME} in ${GENOMEDIRECTORY}"
POSITIONSFILE="${KMERS}.${SNPPOSITIONS}"
SNPSFILE="${GENOMEDIRECTORY}/${SNPSPREFIX}"
if [ -s "${POSITIONSFILE}" ] # We can create the full SNPs data that references locations in the original genome.
then
awk -F'\011' -v "f=${FILE}" '{print $1 "\t" $2 "\t" $3 "\t" f "\t" $4}' "${POSITIONSFILE}" >> "${ALLSNPSUNSORTED}"
else # mummer failed.... (we should output something here?)
# Output SNP data, though we don't have location and direction info, and for some reason we put the genome name
# where the file name should go ??? (we have both genome name and filename, so that's weird, right?)
# We can get a lot of the same mapping (filename, ascension number, genome name) from non-mummer info, so this
# seems weak sauce. Also the output is v. different in terms of the kmers / SNPs that show up (it seems).
# TODO FIXME XXX not sure what's going on here.
echo "No positions file ${KMERS}.${SNPPOSITIONS} found, generating all_SNPs_unsorted without position data."
awk -v "genome=${GENOME}" '{print $1 "\t" $2 "\tx\t" genome "\t" }' "${KMERS}.${SNPSPREFIX}" >> "${ALLSNPSUNSORTED}"
fi
done
### Now we have unsorted SNP data from all genomes in one file.
if [ -e "${SNPS_ALL_INPUT}" ] # If we have prior SNPs data...
then
# Add all of the $SNPS_ALL_INPUT file to the end of the unsorted SNPs data, omitting the first column
# which is the count.
awk -F'\011' '{print $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6 "\t" $7}' "${SNPS_ALL_INPUT}" >> "${ALLSNPSUNSORTED}"
fi
# Sort the resulting SNPs file.
"${PERFER}" sort -u "${ALLSNPSUNSORTED}" > "${ALLSNPSSORTED}"
export ALLSNPSSORTEDLABELED="all_SNPs_sorted_labelLoci"
# Create a unique number for each distinct loci, separates loci by blank line.
# Outputs to all_SNPs_sorted_labelLoci
"${PERFER}" "${NUMBERSNPS}" "${COUNTSNPS}" < "${ALLSNPSSORTED}" > "${ALLSNPSSORTEDLABELED}"
export SNPSALLOUTPUT="SNPs_all"
"${PERFER}" "${RENAMEFROMTABLE}" "${ALLSNPSSORTEDLABELED}" "${FILE2GENOME}" "${SNPSALLOUTPUT}"
# Set reference genome for vcf file to the be first finished genome, if this is empty, then set it to be the first genome in the input fasta file.
if [ -s "${ANNOTATELIST}" ]
then
REFERENCEGENOME=`head -1 "${ANNOTATELIST}"`
else
REFERENCEGENOME=`head -1 "${FILE2GENOME}" | awk '{print $2}'`
fi
export VCFFILE="VCF.${REFERENCEGENOME}.vcf"
if [ 1 -eq "${VCF}" ] # If -vcf flag was passed to kSNP, parse the SNPs_all file.
then
# Bug where this invocation has error: "Cannot open all" - FIXME TODO XXX - JN
"${PERFER}" "${PARSESNPSTOVCF}" "${SNPSALLOUTPUT}" "${VCFFILE}" "${REFERENCEGENOME}"
fi
echo "Finished finding SNPs"
date
#############
#############
#
# FINISHED Generating SNPs
#
#############
#############
#############
#############
#
# START Generating phylogenic trees
#
#############
#############
#####
#####
# Trees we are going to generate:
# - Parsimony trees of:
# * all SNPs [This one will always be generated; the others are optional]
# * min_frac SNPs (those appearing in at least that fraction of the genomes)
# * core SNPs (those appearing in all genomes)
# - ML tree of all SNPs.
# - NJ tree of all SNPs
#####
#####
# Input for tree-maker is:
# - List of SNPs to make a tree for.
# - Type of tree to make
# ???
# - Directory to go to for kSNP binaries?
#
# Output from tree-maker is....?
# All trees depend on an SNPs matrix file (or the fasta format of it), so for each of the above
# sets of SNPs (see parsimony section) we need to run $SNPSTOFASTAMATRIX
# All of the above trees, once created, have nodes placed, are re-rooted, and allele counts labeled.
# File names are somewhat in question; what files are required for each of these processes, and
# what files are considered output as a part of each of these processes. The other files are temp
# files and can be contained within a sub-routine; also the names don't matter except to avoid
# collision. Perhaps they can be created in specific sub-directories?
#############
# START Generate parsimony trees
#############
export SNPSALLMATRIX="${SNPSALLOUTPUT}_matrix"
export SNPSALLMATRIXFASTA="${SNPSALLOUTPUT}_matrix.${SNPLOCISUFFIX}"
export SNPSALLPARSIMONYTREE="tree.${SNPSALLOUTPUT}.parsimony.tre"
# Output files from $FINDCORESNPS are:
export MAJORITYSNPS="SNPs_in_majority${MIN_FRACTION_WITH_LOCUS}"
export SNPSMAJORITYMATRIX="${MAJORITYSNPS}_matrix"
export SNPSMAJORITYMATRIXFASTA="${MAJORITYSNPS}_matrix.${SNPLOCISUFFIX}"
export CORESNPS="core_SNPs"
export SNPSCOREMATRIX="${CORESNPS}_matrix"
export SNPSCOREMATRIXFASTA="${CORESNPS}_matrix.${SNPLOCISUFFIX}"
export NONCORESNPS="nonCore_SNPs"
export SNPCOUNTS="COUNT_coreSNPs" # File with statistics from core_SNPs3 run.
TREESTOBUILD="parsimony" # No option to omit parsimony trees.
if [ 1 -eq "${ML}" ]
then
TREESTOBUILD="${TREESTOBUILD} ML"
fi
if [ 1 -eq "${NJ}" ]
then
TREESTOBUILD="${TREESTOBUILD} NJ"
fi
for THISTREE in ${TREESTOBUILD} # Note: No quotes because we want the spaces to be separators between types of trees
do
# Build the tree from all the SNPs.
"${PERFER}" "${SNPSTOFASTAMATRIX}" "${SNPSALLOUTPUT}" "${SNPSALLMATRIXFASTA}" "${SNPSALLMATRIX}"
"${PERFER}" "${BUILDTREE}" "${SNPSALLOUTPUT}" "${THISTREE}"
if [ -n "${MIN_FRACTION_WITH_LOCUS}" ] # Requested a tree for SNPs in the given fraction of genomes.
then
echo "Getting SNPs in the majority based on ${MIN_FRACTION_WITH_LOCUS} fraction"
# This generates core SNPs as a side-effect.
"${PERFER}" "${FINDCORESNPS}" "${SNPSALLOUTPUT}" "${FILE2GENOME}" "${MIN_FRACTION_WITH_LOCUS}"
"${PERFER}" "${SNPSTOFASTAMATRIX}" "${MAJORITYSNPS}" "${SNPSMAJORITYMATRIXFASTA}" "${SNPSMAJORITYMATRIX}"
# Build trees for SNPs in greater than $MIN_FRACTION_WITH_LOCUS fraction of genomes
"${PERFER}" "${BUILDTREE}" "${MAJORITYSNPS}" "${THISTREE}"
if [ 1 -eq "${CORE}" ] # Also requested tree for the core SNPs
then
"${PERFER}" "${SNPSTOFASTAMATRIX}" "${CORESNPS}" "${SNPSCOREMATRIXFASTA}" "${SNPSCOREMATRIX}"
# Build trees for SNPs that are in all genomes.
"${PERFER}" "${BUILDTREE}" "${CORESNPS}" "${THISTREE}"
fi
elif [ 1 -eq "${CORE}" ] # Only requested tree for core SNPs.
then
echo "Identifying core SNPs for use in creating trees"
# There was no specific min fraction to use, so we just pick one so that we
# can benefit from the side effect that creates the core SNPs.
"${PERFER}" "${FINDCORESNPS}" "${SNPSALLOUTPUT}" "${FILE2GENOME}" 0.5