-
Notifications
You must be signed in to change notification settings - Fork 14
/
phyloscanner_make_trees.py
executable file
·2497 lines (2276 loc) · 109 KB
/
phyloscanner_make_trees.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
from __future__ import print_function
# Quit if this is being run as python 3.
import sys
if sys.version_info.major > 2:
name = sys.argv[0]
print(name, "is written in python 2; it is not compatible with python 3.",
"Sorry! Note that you can specify exactly which version of python on your",
"machine is used by running a command like\n$ python", name, "[input and",
"options]\ninstead of\n$", name, "[input and options]\nreplacing 'python' by",
"the specific executable you want.\nQuitting.", file=sys.stderr)
exit(1)
## Author: Chris Wymant, c.wymant@imperial.ac.uk
## Acknowledgement: I wrote this while funded by ERC Advanced Grant PBDR-339251
##
## Overview:
ExplanatoryMessage = '''phyloscanner_make_trees.py infers phylogenies containing
both within- and between-host pathogen genetic diversity in windows along the
genome, using mapped reads as input. For each window, for each sample, all reads
mapped to that window are found and identical reads are collected together with
an associated count. Then all reads from all samples in each window are aligned
using MAFFT and a phylogeny is constructed using RAxML or IQtree. Temporary files and
output files are written to the current working directory; to avoid overwriting
existing files, you should run phyloscanner_make_trees.py from inside an empty
directory.
More information on running phyloscanner can be found in the manual. Options are
explained below; for some options we go into greater detail in the manual,
trying to be as concise as possible here.'''
################################################################################
# The names of some files we'll create.
FileForAlignedRefs = 'RefsAln.fasta'
FileForCoords = "WindowCoordinateCorrespondence.csv"
# Some temporary working files we'll create
TempFileForRefs = 'temp_refs.fasta'
TempFileForPairwiseUnalignedRefs = 'temp_2Refs.fasta'
TempFileForPairwiseAlignedRefs = 'temp_2RefsAln.fasta'
TempFileForReads_basename = 'temp_UnalignedReads'
TempFileForOtherRefs_basename = 'temp_OtherRefs'
################################################################################
GapChar = '-'
import os
import collections
import itertools
import subprocess
import re
import copy
import shutil
import glob
import time
import argparse
import pysam
import csv
from Bio import SeqIO
from Bio import Seq
from Bio import AlignIO
from Bio import Align
from distutils.version import LooseVersion
import tools.phyloscanner_funcs as pf
# Define a function to check files exist, as a type for the argparse.
def File(MyFile):
if not os.path.isfile(MyFile):
raise argparse.ArgumentTypeError(MyFile+' does not exist or is not a file.')
return MyFile
# Define a function to check directories exist, as a type for the argparse.
def Dir(MyDir):
if not os.path.isdir(MyDir):
raise argparse.ArgumentTypeError(MyDir + \
' does not exist or is not a directory.')
return MyDir
# Define a comma-separated integers object, as a type for the argparse.
def CommaSeparatedInts(MyCommaSeparatedInts):
try:
ListOfInts = [int(value) for value in MyCommaSeparatedInts.split(',')]
except:
raise argparse.ArgumentTypeError('Unable to understand ' +\
MyCommaSeparatedInts + ' as comma-separated integers.')
else:
return ListOfInts
# A class to have new lines in argument help text
class SmartFormatter(argparse.HelpFormatter):
def _split_lines(self, text, width):
if text.startswith('R|'):
return text[2:].splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
# Set up the arguments for this script
parser = argparse.ArgumentParser(description=ExplanatoryMessage,
formatter_class=SmartFormatter)
# Positional args
parser.add_argument('BamAndRefList', type=File,
help='''R|A csv-format file listing the bam and reference files
(i.e. the fasta-format file containing the sequence to
which the reads were mapped). The first column should
be the bam file, the second column the corresponding
reference file, with a comma separating the two. An
optional third column, if present, will be used to
rename the bam files in all output. For example:
PatientA.bam,PatientA_ref.fasta,A
PatientB.bam,PatientB_ref.fasta,B''')
parser.add_argument('-Q', '--quiet', action='store_true', help='''Turns off the
small amount of information printed to the terminal (via stdout). We'll still
print warnings and errors (via stderr), and the command you ran, for logging
purposes.''')
parser.add_argument('-V', '--verbose', action='store_true', help='''Print a
little more information than usual. The --time option also provides extra
information on progress.''')
WindowArgs = parser.add_argument_group('Window options - you must choose'
' exactly one of -W, -AW, -E or -ES')
WindowArgs.add_argument('-W', '--windows', type=CommaSeparatedInts,
help='''Used to specify a comma-separated series of paired coordinates defining
the boundaries of the windows. e.g. 1,300,301,600,601,900 would define windows
1-300, 301-600, 601-900.''')
WindowArgs.add_argument('-AW', '--auto-window-params',
type=CommaSeparatedInts, help='''Used to specify 2, 3 or 4 comma-separated
integers controlling the automatic creation of regular windows. The first
integer is the width you want windows to be. (If you are not using the
--pairwise-align-to option, columns in the alignment of all references are
weighted by their non-gap fraction; see the manual for clarification.) The
second integer is the overlap between the end of one window and the start of the
next. The third integer, if specified, is the start position for the first
window (if not specified this is 1). The fourth integer, if specified, is the
end position for the last window (if not specified, windows will continue up to
the end of the reference or references).''')
WindowArgs.add_argument('-E', '--explore-window-widths',
type=CommaSeparatedInts, help='''Use this option to explore how the number of
unique reads found in each bam file in each window, all along the genome,
depends on the window width. After this option specify a comma-separated list of
integers. The first integer is the starting position for stepping along the
genome. Subsequent integers are window widths to try. Output is written to the
file specified with the --explore-window-width-file option.''')
WindowArgs.add_argument('-EF', '--explore-window-width-file', help='Used to '
'specify an output file for window width data, when the '
'--explore-window-widths option is used. Output is in in csv format.')
WindowArgs.add_argument('-ES', '--explore-window-widths-speedy',
type=CommaSeparatedInts, help='''Exactly the same as --explore-window-widths,
except that the number of unique reads is calculated before read processing
(including alignment) instead of after.''')
RecommendedArgs = parser.add_argument_group('Options we particularly recommend')
RecommendedArgs.add_argument('-A', '--alignment-of-other-refs', type=File,
help='''Used to specify an alignment of reference sequences (which need not be
those used to produce the bam files) that will be cut into the same windows as
the bam files and included in the alignment of reads, for comparison. This is
required if phyloscanner is to analyse the trees it produces.''')
RecommendedArgs.add_argument('-2', '--pairwise-align-to', help='''By default,
phyloscanner figures out where corresponding windows are in different bam files
by creating a multiple sequence alignment containing all of the mapping
references used to create the bam files (plus any extra references included with
-A), and window coordinates are intepreted with respect to this alignment.
However using this option, the mapping references used to create the bam files
are each separately pairwise aligned to one of the extra references included
with -A, and window coordinates are interpreted with respect to this
reference. Specify the name of the reference to use after this option. Using
this option is necessary if you want to run the
tools/CalculateTreeSizeInGenomeWindows.py script and feed its output into
phyloscanner_analyse_trees.R via its --normRefFileName option (see the manual
for more details).''')
RecommendedArgs.add_argument('--x-raxml', help=pf.RaxmlHelp)
RecommendedArgs.add_argument('-P', '--merge-paired-reads', action='store_true',
help='''Relevant only for paired-read data for which the mates in a pair
(sometimes) overlap with each other: with this option we merge overlapping mates
into a single (longer) read.''')
QualityArgs = parser.add_argument_group('Options intended to mitigate the '
'impact of poor quality reads')
QualityArgs.add_argument('-I', '--discard-improper-pairs', action='store_true',
help='''For paired-read data, discard all reads that are flagged as improperly
paired.''')
QualityArgs.add_argument('-Q1', '--quality-trim-ends', type=int, help='''Used to
specify a quality threshold for trimming the ends of reads. We trim each read
inwards until a base of this quality is met.''')
QualityArgs.add_argument('-Q2', '--min-internal-quality', type=int, help=\
'''Used to specify an interal quality threshold for reads. Reads are allowed at
most one base below this quality; any read with two or more bases below this
quality are discarded.''')
QualityArgs.add_argument('-MC', '--min-read-count', type=int, default=1, help=\
'''Used to specify a minimum count for each unique read.
Reads with a count (i.e. the number of times that sequence was observed,
after merging if merging is being done) less than this value are discarded.
The default value of 1 means all reads are kept.''')
OtherArgs = parser.add_argument_group('Other assorted options')
OtherArgs.add_argument('--x-raxml-old', help=pf.RaxmlOldHelp)
OtherArgs.add_argument('--x-iqtree', help=pf.IQtreeHelp)
OtherArgs.add_argument('-XC', '--excision-coords', type=CommaSeparatedInts,
help='Used to specify a comma-separated set of integer coordinates that will '
'be excised from the aligned reads. Requires the -XR flag.')
OtherArgs.add_argument('-XR', '--excision-ref', help='''Used to specify the name
of a reference (which must be present in the file you specify with -A) with
respect to which the coordinates specified with -XC are interpreted. If you are
also using the --pairwise-align-to option, you must use the same reference there
and here.''')
OtherArgs.add_argument('-MTA', '--merging-threshold-a', type=int, default=0,
help='''Used to specify a similarity threshold for merging similar reads: reads
that differ by a number of bases equal to or less than this are merged, those
with higher counts effectively absorbing those with lower counts. The default
value of 0 means there is no merging. In this version of the merging algorithm,
if B is similar enough to merge into C, and A is similar enough to merge into B
but not into C (A -> B -> C), B will be merged into C and A will be kept
separate. Contrast with --merging-threshold-b.''')
OtherArgs.add_argument('-MTB', '--merging-threshold-b', type=int, default=0,
help='''Similar to --merging-threshold-a, except that in this version of the
merging algorithm, if B is similar enough to merge into C, and A is similar
enough to merge into B but not into C (A -> B -> C), both A and B will be merged
into C.''')
OtherArgs.add_argument('-CR', '--check-recombination',
action='store_true', help='''Calculates a metric of recombination for each
sample's set of reads in each window. Calculation time scales cubically with the
number of unique reads each sample has per window, so if you're interested
in using this option it's best to play with all other options first to make sure
your windows don't have a ridiculously large number of unique reads per sample.
''')
OtherArgs.add_argument('--max-gap-frac-for-refs', type=float, default=0.5,
help='''If --alignment-of-other-refs is used, an external reference sequence
will be excluded from the current window if its fractional gap content in the
current window (in the alignment including only the references, not the reads)
exceeds this threshold. The default value is 0.5; this option allows a different
value to be specified.''')
OtherArgs.add_argument('-OD', '--output-dir', help='''Used to specify the name
of a directory into which output files will be moved.''')
OtherArgs.add_argument('--time', action='store_true',
help='Print the times taken by different steps.')
OtherArgs.add_argument('--x-mafft', default='mafft', help=''''Used to specify
the command required to run mafft (by default: mafft). Whitespace is interpreted
as separating mafft options, so if a path to the mafft binary is specified it
may not contain whitespace. See also --x-mafft2.''')
OtherArgs.add_argument('--x-mafft2', help='''"If you are using
--pairwise-align-to, by default we will use a different command for pairwise
alignment of references from what you specified with --x-mafft. Specifically, we
use whatever binary you specified there, plus ' --localpair --maxiterate 1000'
(i.e. we use linsi rather mafft). Use this option to override this default,
specifying the mafft command to be used for pairwise alignment of references.
Whitespace is interpreted as separating mafft options, so if a path to the
mafft binary is specified it may not contain whitespace.''')
OtherArgs.add_argument('--x-samtools', default='samtools', help=\
'Used to specify the command required to run samtools, if needed (by default: '
'samtools).')
OtherArgs.add_argument('-KO', '--keep-output-together', action='store_true',
help='''By default, subdirectories are made for different kinds of phyloscanner
output. With this option, all output files will be in the same directory (either
the working directory, or whatever you specify with --output-dir).''')
OtherArgs.add_argument('-KT', '--keep-temp-files', action='store_true', help='''
Keep temporary files we create on the way (these are deleted by default).''')
BioinformaticsArgs = parser.add_argument_group('Options for detailed'
' bioinformatic interrogation of the input bam files (not intended for normal'
' usage)')
BioinformaticsArgs.add_argument('-IO', '--inspect-disagreeing-overlaps',
action='store_true', help='''With --merge-paired-reads, those pairs that
overlap but disagree are discarded. With this option, these discarded pairs
are written to a bam file (one per patient, with their reference file copied
to the working directory) for your inspection.''')
BioinformaticsArgs.add_argument('--exact-window-start', action='store_true',
help='''Normally phyloscanner retrieves all reads that fully
overlap a given window, i.e. starting at or anywhere before the window start,
and ending at or anywhere after the window end. If this option is used without
--exact-window-end, the reads that are retrieved are those that start at exactly
the start of the window, and end anywhere (ignoring all the window end
coordinates specified). If this option is used with --exact-window-end, for a
read to be kept it must start at exactly the window start AND end at exactly the
window end.''')
BioinformaticsArgs.add_argument('--exact-window-end', action='store_true',
help='''With this option, the reads that are retrieved are those that end at
exactly the end of the window. Read the --exact-window-start help.''')
BioinformaticsArgs.add_argument('-CE', '--recover-clipped-ends',
action='store_true', help ='''This option recovers (soft-)clipped read ends by
setting the bases they contain to be mapped, assuming no indels inside the
clipped end. WARNING: mapping software clips the ends of reads for a reason.''')
StopEarlyArgs = parser.add_argument_group('Options to only partially run '
'phyloscanner, stopping early or skipping steps')
StopEarlyArgs.add_argument('-AO', '--align-refs-only', action='store_true',
help='''Align the mapping references used to create the bam files, plus any
extra reference sequences specified with -A, then quit without doing anything
else.''')
StopEarlyArgs.add_argument('-T', '--no-trees', action='store_true',
help='Process and align the reads from each window, then quit without making '
'trees.')
StopEarlyArgs.add_argument('-D', '--dont-check-duplicates', action='store_true',
help="Don't compare reads between samples to find duplicates - a possible "+\
"indication of contamination. (By default this check is done.)")
StopEarlyArgs.add_argument('-RNO', '--read-names-only', action='store_true',
help='''Stop analysing each window as soon as possible after writing the read
names to a file.''')
StopEarlyArgs.add_argument('-NRN', '--no-read-names', action='store_true',
help='''Do not record the correspondence between each unique sequence retained
by phyloscanner, and the reads that went into this sequence (as they are named
in the bam file), as is done by default.''')
StopEarlyArgs.add_argument('-CCO', '--coordinate-correspondence-only',
action='store_true', help="Stop straight away after producing " + \
FileForCoords + ", which shows the correspondence in window coordinates.")
DeprecatedArgs = parser.add_argument_group('Deprecated options, left here for'
'backward compatability or interest.')
DeprecatedArgs.add_argument('-C', '--contaminant-count-ratio', type=float,
help='''Used to specify a numerical value which is interpreted in the following
'way: if a sequence is found exactly duplicated between any two bam files,
'and is more common in one than the other by a factor at least equal to this
'value, the rarer sequence is deleted and goes instead into a separate
contaminant read fasta file. (This is considered deprecated because including the
exact duplicates in the tree and dealing with them during tree analysis is more
flexible.)''')
DeprecatedArgs.add_argument('-CO', '--flag-contaminants-only',
action='store_true',
help="For each window, just flag contaminant reads then move on (without "
"aligning reads or making a tree). Only makes sense with the -C flag.")
#DeprecatedArgs.add_argument('-CD', '--contaminant-read-dir', type=Dir,
#help='''A directory containing the contaminant read fasta files produced by a
#previous run of phyloscanner using the the -C flag:
#reads flagged as contaminants there will be considered contaminants in this run
#too. The windows must exactly match up between these two runs, most easily
#achieved with the -2 option. The point of this option is to first do a run with
#every bam file you have in which there could conceivably be cross-contamination,
#using the -C flag (and possibly -CO to save time), and then in subsequent runs
#focussing on subsets of bam files you will be able to identify contamination
#from outside that subset.''')
DeprecatedArgs.add_argument('-FR', '--forbid-read-repeats', action='store_true',
help='''Using this option, if a read with the same name is found
to span each of a series of consecutive, overlapping windows, it is only used in
the first window.''')
DeprecatedArgs.add_argument('-O', '--keep-overhangs', action='store_true',
help='''Keep the part of the read that overhangs the edge of the window. (By
default this is trimmed, i.e. only the part of the read inside the window is
kept.)''')
DeprecatedArgs.add_argument('-RC', '--ref-for-coords', help='''By default,
window coordinates are interpreted as alignment coordinates, in the multiple
sequence alignment of all references (which phyloscanner creates). With this
option, window coordinates are interpreted with respect to a named reference in
that alignment. Specify the name of the desired reference after this option.
This is deprecated because the --pairwise-align-to option is expected to perform
better.''')
DeprecatedArgs.add_argument('-RG', '--recombination-gap-aware',
action='store_true',
help='''By default, when calculating Hamming distances for the recombination
metric, positions with gaps are ignored. With this option, the gap character
counts as a fifth base.''')
DeprecatedArgs.add_argument('-RD', '--recombination-norm-diversity',
action='store_true', help='''By default, the normalising constant for the
recombination metric is half the number of sites in the window; with this option
it's half the number of sites in the window that are polymorphic for that bam
file.''')
DeprecatedArgs.add_argument('-RNS', '--read-names-simple', action='store_true',
help='''Produce a file for each window and each bam, simply listing the names of
the reads that phyloscanner used (but not the correspondence between these and
the set of unique sequences retained by phyloscanner after any merging etc.).
This is not affected by the --no-read-names option.''')
args = parser.parse_args()
# Shorthand
WindowCoords = args.windows
UserSpecifiedCoords = args.windows != None
AutoWindows = args.auto_window_params != None
IncludeOtherRefs = args.alignment_of_other_refs != None
QualTrimEnds = args.quality_trim_ends != None
ImposeMinQual = args.min_internal_quality != None
ExcisePositions = args.excision_coords != None
PairwiseAlign = args.pairwise_align_to != None
FlagContaminants = args.contaminant_count_ratio != None
#RecallContaminants = args.contaminant_read_dir != None
RecallContaminants = False
CheckDuplicates = not args.dont_check_duplicates
ExploreWindowWidths = args.explore_window_widths != None
ExploreWindowWidthsFast = args.explore_window_widths_speedy != None
MergeReadsA = args.merging_threshold_a > 0
MergeReadsB = args.merging_threshold_b > 0
MergeReads = MergeReadsA or MergeReadsB
PrintInfo = not args.quiet
RecombNormToDiv = args.recombination_norm_diversity
ReadNamesDetailed = not args.no_read_names
# Print how this script was called, for logging purposes.
print('phyloscanner was called thus:\n' + ' '.join(sys.argv))
#Check which tree inference tool to use
Use_raxml_old = args.x_raxml_old != None
Use_iqtree = args.x_iqtree != None
Use_raxml_ng = args.x_raxml != None
# Limit to one tree inference program
if Use_raxml_old + Use_iqtree + Use_raxml_ng > 1:
print("You may not specify more than one of these options: --x-raxml,",
"--x-raxml-old,--x-iqtree. Quitting.", file=sys.stderr)
exit(1)
# Exit if options that prevent making trees are used alongside tree inference programs
if (ExploreWindowWidths or ExploreWindowWidthsFast or args.read_names_only or args.no_trees) and \
((not args.x_raxml is None) or (not args.x_raxml_old is None) or (not args.x_iqtree is None)):
print('Options are specified that prevent trees being produced alongside specified tree inference',
'programs. Remove these options to infer trees', file=sys.stderr)
exit(1)
# Some options make trees unnecessary
if ExploreWindowWidths or ExploreWindowWidthsFast or args.read_names_only:
args.no_trees = True
# Warn if tree files exist already.
if not args.no_trees:
if Use_raxml_old and glob.glob('RAxML_*'):
print('Warning: RAxML-old files are present in the working directory. If their',
'names clash with those that phyloscanner will try to create, RAxML will',
'fail to run. Continuing.', file=sys.stderr)
elif Use_iqtree and glob.glob('IQtree_*'):
print('Warning: IQtree files are present in the working directory. If their',
'names clash with those that phyloscanner will try to create, IQtree will',
'fail to run. Continuing.', file=sys.stderr)
else:
if not args.no_trees and glob.glob('*.raxml.*'):
print('Warning: RAxML files are present in the working directory. If their',
'names clash with those that phyloscanner will try to create, RAxML will',
'fail to run. Continuing.', file=sys.stderr)
## Quit if the FileForCoords exists already.
#if os.path.isfile(FileForCoords):
# print('Error:', FileForCoords, 'exists already; quitting to prevent',
# 'overwriting. Please move, rename or delete this file as desired.',
# file=sys.stderr)
# exit(1)
TempFiles = set([])
# Try to make the output dir, if desired.
HaveMadeOutputDir = False
if args.output_dir != None:
if os.path.isdir(args.output_dir):
HaveMadeOutputDir = True
else:
try:
os.mkdir(args.output_dir)
except:
print('Problem creating the directory', args.output_dir+\
". Is this a subdirectory inside a directory that doesn't exist? (We can",
"only create one new directory at a time - not a new directory inside",
"another new one.) Continuing.", file=sys.stderr)
else:
HaveMadeOutputDir = True
# Check only one merging type is specified. Thereafter use its threshold.
if MergeReadsA and MergeReadsB:
print('You cannot specify both --merging-threshold-a and',
'--merging-threshold-b. Quitting.', file=sys.stderr)
exit(1)
if MergeReadsA:
MergingThreshold = args.merging_threshold_a
elif MergeReadsB:
MergingThreshold = args.merging_threshold_b
# Check that window coords have been specified either manually or automatically,
# or we're exploring window widths
NumWindowOptions = len([Bool for Bool in [UserSpecifiedCoords, AutoWindows,
ExploreWindowWidths, ExploreWindowWidthsFast] if Bool == True])
if NumWindowOptions != 1:
print('Exactly one of the --windows, --auto-window-params,',
'--explore-window-widths, --explore-window-widths-fast options should be',
'specified. Quitting.', file=sys.stderr)
exit(1)
# For convenience, since common stuff needs doing in both cases
if ExploreWindowWidthsFast:
ExploreWindowWidths = True
args.explore_window_widths = args.explore_window_widths_speedy
# If using automatic windows (i.e. not specifying any coordinates), the user
# should not specify a reference for their coords to be interpreted with respect
# to, nor use a directory of contaminant reads (since windows must match to make
# use of contaminant reads).
if AutoWindows and args.ref_for_coords != None:
print('The --ref-for-coords and --auto-window-params',
'options should not be specified together: the first means your',
'coordinates should be interpreted with respect to a named reference, and',
"the second means you're not specfiying any coordinates. Quitting.",
file=sys.stderr)
exit(1)
if RecallContaminants and (not UserSpecifiedCoords):
print('If using the --contaminant-read-dir option you must also specify',
'windows with the --windows option, because the former requires that the',
'windows in the current run exactly match up with those from the run that',
'produces your directory of contaminant reads. Quitting.', file=sys.stderr)
exit(1)
# If coords were specified with respect to one particular reference,
# WindowCoords will be reassigned to be the translation of those coords to
# alignment coordinates. UserCoords are the original coords, which we use for
# labelling things to keep labels intuitive for the user.
UserCoords = WindowCoords
# Find contaminant read files and link them to their windows.
if RecallContaminants:
ContaminantFilesByWindow = {}
ContaminantFileRegex = FileForDuplicateSeqs_basename + \
'InWindow_(\d+)_to_(\d+)\.fasta'
ContaminantFileRegex2 = re.compile(ContaminantFileRegex)
LeftWindowEdges = UserCoords[::2]
RightWindowEdges = UserCoords[1::2]
PairedWindowCoords = zip(LeftWindowEdges, RightWindowEdges)
for AnyFile in os.listdir(args.contaminant_read_dir):
if ContaminantFileRegex2.match(AnyFile):
# TODO: should that be .search instead of .match?
(LeftEdge, RightEdge) = ContaminantFileRegex2.match(AnyFile).groups()
(LeftEdge, RightEdge) = (int(LeftEdge), int(RightEdge))
if (LeftEdge, RightEdge) in PairedWindowCoords:
ContaminantFilesByWindow[(LeftEdge, RightEdge)] = \
os.path.join(args.contaminant_read_dir, AnyFile)
if len(ContaminantFilesByWindow) == 0:
print('Failed to find any files matching the regex', ContaminantFileRegex,
'in', args.contaminant_read_dir + '. Quitting.', file=sys.stderr)
exit(1)
# Check the contamination ratio is >= 1
if FlagContaminants and args.contaminant_count_ratio < 1:
print('The value specified with --contaminant-count-ratio must be greater',
'than 1. (It is the ratio of the more common duplicate to the less common',
'one at which we consider the less common one to be contamination; it',
"should probably be quite a lot larger than 1.) Quitting.", file=sys.stderr)
exit(1)
# Flagging contaminants requires that we check for duplicates
if args.dont_check_duplicates and FlagContaminants:
print('The --dont-check-duplicates and --contaminant-count-ratio options',
'cannot be used together: flagging contaminants requires that we check',
'duplicates. Quitting.', file=sys.stderr)
exit(1)
# The -XR and -XC flags should be used together or not all.
if (ExcisePositions and args.excision_ref == None) or \
((not ExcisePositions) and args.excision_ref != None):
print('The --excision-coords and --excision-ref options require each other:',
'use both, or neither. Quitting.', file=sys.stderr)
exit(1)
if ReadNamesDetailed and MergeReadsB:
print('The --merging-threshold-b option can only be used with the',
'--no-read-names option. Quitting.', file=sys.stderr)
exit(1)
# Check not --read-names-only if no read names are being recorded.
if args.read_names_only and not ReadNamesDetailed and \
not args.read_names_simple:
print('Using --read-names-only with --no-read-names and without',
'--read-names-simple makes no sense: no read names are being recorded.',
'Assuming this is an error, quitting.', file=sys.stderr)
exit(1)
# Sanity checks on using the pairwise alignment option.
if PairwiseAlign:
if args.ref_for_coords != None:
print('Note that if the --pairwise-align-to option is used, using the',
'--ref-for-coords as well is redundant.', file=sys.stderr)
if args.ref_for_coords != args.pairwise_align_to:
print('Furthermore you have chosen two different values for these flags,'
, 'indicating some confusion as to their use. Try again.')
exit(1)
if ExcisePositions and args.excision_ref != args.pairwise_align_to:
print('The --pairwise-align-to and --excision-ref options can only be',
'used at once if the same reference is specified for both. Qutting.',
file=sys.stderr)
exit(1)
# Sanity check on --max-gap-frac-for-refs
if not (0 <= args.max_gap_frac_for_refs < 1):
print('The value specified with --max-gap-frac-for-refs should be equal to '
'or greater than 0 and less than 1. Quitting.', file=sys.stderr)
exit(0)
def CheckMaxCoord(coords, ref):
'''Check that no coordinate is after the end of the reference with respect to
which it is supposed to be interpreted.'''
if max(coords) > len(ref.seq.ungap("-")):
print('You have specified at least one coordinate (', max(coords),
') that is larger than the length of the reference with respect to which',
' those coordinates are to be interpreted - ', ref.id, '. Quitting.',
sep='', file=sys.stderr)
exit(1)
def SanityCheckWindowCoords(WindowCoords):
'Check window coordinates come in pairs, all positive, the right > the left.'
NumCoords = len(WindowCoords)
if NumCoords % 2 != 0:
raise ValueError('An even number of WindowCoords must be specified. '+\
'Quitting.')
if any(coord < 1 for coord in WindowCoords):
raise ValueError('All WindowCoords must be greater than zero. Quitting.')
LeftWindowEdges = WindowCoords[::2]
RightWindowEdges = WindowCoords[1::2]
PairedWindowCoords = zip(LeftWindowEdges, RightWindowEdges)
for LeftWindowEdge, RightWindowEdge in PairedWindowCoords:
if LeftWindowEdge >= RightWindowEdge:
raise ValueError('You specified a window as having left edge ' +\
str(LeftWindowEdge) +' and right edge ' +str(RightWindowEdge)+\
'. Left edges should be less than their right edges. Quitting.')
exit(1)
return NumCoords
# Sanity checks on user specified WindowCoords
if UserSpecifiedCoords:
NumCoords = SanityCheckWindowCoords(WindowCoords)
# Sanity checks on auto window parameters
if AutoWindows:
NumAutoWindowParams = len(args.auto_window_params)
if not NumAutoWindowParams in [2,3,4]:
print('The --auto-window-params option requires 2, 3 or 4 integers.',
'Quitting.', file=sys.stderr)
exit(1)
WeightedWindowWidth = args.auto_window_params[0]
WindowOverlap = args.auto_window_params[1]
if NumAutoWindowParams > 2:
WindowStartPos = args.auto_window_params[2]
if WindowStartPos < 1:
print('The start position for the --auto-window-params option must be',
'greater than zero. Quitting.', file=sys.stderr)
exit(1)
if NumAutoWindowParams == 4:
WindowEndPos = args.auto_window_params[3]
else:
WindowEndPos = float('inf')
else:
WindowStartPos = 1
WindowEndPos = float('inf')
if WeightedWindowWidth <= 0:
print('The weighted window width for the --auto-window-params option must',
'be greater than zero. Quitting.', file=sys.stderr)
exit(1)
# Sanity checks on window-width exploration parameters
if ExploreWindowWidths:
if args.explore_window_width_file == None:
print('The --explore-window-widths option requires the',
'--explore-window-width-file option. Quitting.', file=sys.stderr)
exit(1)
try:
with open(args.explore_window_width_file, 'w') as f:
pass
except:
print('Unable to open', args.explore_window_width_file, 'for writing. (Is',
"it a file inside a directory that doesn't exist?). Quitting.",
file=sys.stderr)
raise
if len(args.explore_window_widths) < 2:
print('The --explore-window-widths option should be used to specify at',
'least two parameters; use the --help option for more information.',
'Quitting.', file=sys.stderr)
exit(1)
ExploreStart = args.explore_window_widths[0]
ExploreWidths = args.explore_window_widths[1:]
if ExploreStart < 1:
print('The start point for windows when exploring window widths (the '+\
'first integer specified with --explore-window-widths) cannot be less '+\
'than 1. Quitting.', file=sys.stderr)
exit(1)
ExploreWidths = sorted(ExploreWidths)
MinExploreWidth = ExploreWidths[0]
if MinExploreWidth < 2:
print('The minimum window width specified with --explore-window-widths',
'should be greater than 1. Quitting.', file=sys.stderr)
exit(1)
MaxExploreWidth = ExploreWidths[-1]
WindowWidthExplorationData = []
CheckDuplicates = False
def FindExploratoryWindows(EndPoint):
'''Returns the set of coordinates needed to step across the genome with the
desired start, end and window width.'''
# The EndPoint argument should be:
# * the ref length if --ref-for-coords or --pairwise-align-to is used
# * the length of the mapping ref if there's only one bam and no extra refs
# * otherwise, the length of the alignment of all refs
if EndPoint < ExploreStart + MaxExploreWidth:
print('With the --explore-window-widths option you specified a start',
'point of', ExploreStart, 'and your largest window width was',
str(MaxExploreWidth) + '; one or both of these values should be',
'decreased since the length of the reference or alignment of references',
'with respect to which we are interpreting coordinates is only',
str(EndPoint) + '. We need to be able to fit at least one window in',
'between the start and end. Quitting.', file=sys.stderr)
exit(1)
ExploratoryCoords = []
for width in ExploreWidths:
NextStart = ExploreStart
NextEnd = ExploreStart + width - 1
while NextEnd <= EndPoint:
ExploratoryCoords += [NextStart, NextEnd]
NextStart += width
NextEnd += width
return ExploratoryCoords
def CleanUp(TempFiles):
'''Delete temporary files we've made'''
if not args.keep_temp_files:
for TempFile in TempFiles:
try:
os.remove(TempFile)
except:
print('Failed to delete temporary file', TempFile + '. Leaving it.',
file=sys.stderr)
# Record the names of any external refs being included.
# If we're doing pairwise alignments, we'll also need gappy and gapless copies
# of the ref chosen for pairwise alignment.
# Check that any coordinates that are to be interpreted with respect to a named
# reference do not go past the end of that reference.
ExternalRefNames = []
if IncludeOtherRefs:
try:
ExternalRefAlignment = AlignIO.read(args.alignment_of_other_refs, "fasta")
except:
print('Problem reading', args.alignment_of_other_refs + ':',
file=sys.stderr)
raise
for ref in ExternalRefAlignment:
ExternalRefNames.append(ref.id)
if ref.id == args.pairwise_align_to:
RefForPairwiseAlnsGappySeq = str(ref.seq)
RefForPairwiseAlns = copy.deepcopy(ref)
RefForPairwiseAlns.seq = RefForPairwiseAlns.seq.ungap("-")
RefForPairwiseAlnsLength = len(RefForPairwiseAlns.seq)
if UserSpecifiedCoords:
CheckMaxCoord(WindowCoords, ref)
elif ExploreWindowWidths:
MaxCoordForWindowWidthTesting = RefForPairwiseAlnsLength
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
if ref.id == args.ref_for_coords:
if UserSpecifiedCoords:
CheckMaxCoord(WindowCoords, ref)
elif ExploreWindowWidths:
MaxCoordForWindowWidthTesting = len(ref.seq.ungap("-"))
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
if ref.id == args.excision_ref:
CheckMaxCoord(args.excision_coords, ref)
# Consistency checks on flags that require a ref.
for FlagName, FlagValue in (('--ref-for-coords', args.ref_for_coords),
('--pairwise-align-to', args.pairwise_align_to),
('--excision-ref', args.excision_ref)):
#('--ref-for-rooting', args.ref_for_rooting),
if FlagValue == None:
continue
if not IncludeOtherRefs:
print('The', FlagName, 'flag requires the --alignment-of-other-refs',
'flag. Quitting.', file=sys.stderr)
exit(1)
if not FlagValue in ExternalRefNames:
print('Reference', FlagValue +', specified with the', FlagName,
'flag, was not found in', args.alignment_of_other_refs +'. Quitting.',
file=sys.stderr)
exit(1)
# Remove duplicated excision coords. Sort from largest to smallest.
if ExcisePositions:
args.excision_coords = list(set(args.excision_coords))
args.excision_coords = sorted(args.excision_coords, reverse=True)
PythonPath = sys.executable
TranslateCoordsCode = pf.FindAndCheckCode(PythonPath, 'TranslateCoords.py')
FindSeqsInFastaCode = pf.FindAndCheckCode(PythonPath, 'FindSeqsInFasta_phyloscanner.py')
FindWindowsCode = pf.FindAndCheckCode(PythonPath,
'FindInformativeWindowsInFasta.py')
# Select a Test function depending on chosen tree inference program
if not args.no_trees:
if Use_raxml_old:
TreeArgList = pf.TestTreeInference(args.x_raxml_old, "RAxML-standard")
elif Use_iqtree:
TreeArgList = pf.TestTreeInference(args.x_iqtree, "IQtree")
else:
TreeArgList = pf.TestTreeInference(args.x_raxml, "RAxML-NG")
# Set up the mafft commands
if '--add' in args.x_mafft or \
(args.x_mafft2 != None and '--add' in args.x_mafft2):
print('Do not specify --add in your mafft command: we automatically use',
'--add as needed, depending on what is being aligned. Quitting.',
file=sys.stderr)
exit(1)
MafftArgList = args.x_mafft.split()
if args.x_mafft2 == None:
Mafft2ArgList = [MafftArgList[0], '--localpair', '--maxiterate', '1000']
else:
Mafft2ArgList = args.x_mafft2.split()
times = []
if args.time:
times.append(time.time())
# Read in the input bam and ref files
BamFiles, RefFiles, BamAliases, BamFileBasenames = \
pf.ReadInputCSVfile(args.BamAndRefList)
NumberOfBams = len(BamFiles)
# Don't produce duplication files if there's only one bam.
if NumberOfBams == 1:
CheckDuplicates = False
# Read in all the reference sequences. Set each seq name to be the corresponding
# alias.
RefSeqs = []
for i,RefFile in enumerate(RefFiles):
SeqList = list(SeqIO.parse(open(RefFile),'fasta'))
if len(SeqList) != 1:
print('There are', len(SeqList), 'sequences in', RefFile+'. There should',
'be exactly 1.\nQuitting.', file=sys.stderr)
exit(1)
SeqList[0].id = BamAliases[i]
RefSeqs += SeqList
def TranslateCoords(CodeArgs):
'''Runs TranslateCoordsCode with the supplied args, and returns the results as
a dict.'''
try:
CoordsString = subprocess.check_output([PythonPath, TranslateCoordsCode] + \
CodeArgs)
except:
print('Problem executing', TranslateCoordsCode +'. Quitting.',
file=sys.stderr)
raise
CoordsDict = {}
for line in CoordsString.splitlines():
# Trim leading & trailing whitespace and skip blank lines
line = line.strip()
if line == '':
continue
# Each line in the output of the TranslateCoordsCode should be a sequence
# name then the coordinates.
fields = line.split()
if len(fields) != NumCoords +1:
print('Unexpected number of fields in line\n' +line +'\nin the output '+\
'of ' +TranslateCoordsCode+'\nQuitting.', file=sys.stderr)
exit(1)
SeqName = fields[0]
coords = fields[1:]
# Convert the coordinates to integers.
# Where an alignment coordinate is inside a deletion in a particular
# sequence, TranslateCoords.py returns an integer + 0.5 for the coordinate
# with respect to that sequence. Python won't convert such figures directly
# from string to int, but we can do so via a float intermediate. This rounds
# down, i.e. to the coordinate of the base immediately to the left of the
# deletion.
for i in range(len(coords)):
if coords[i] != 'NaN':
try:
coords[i] = int(coords[i])
except ValueError:
if '.5' in coords[i]:
coords[i] = int(float(coords[i]))
else:
print('Unable to understand the coordinate', coords[i],
'as an integer in line\n' +line +'\nin the output of '+\
TranslateCoordsCode+'\nQuitting.', file=sys.stderr)
exit(1)
CoordsDict[SeqName] = coords
return CoordsDict
def SimpleCoordsFromAutoParams(RefSeqLength):
WindowEndPosLocal = min(WindowEndPos, RefSeqLength)
if WindowEndPosLocal < WindowStartPos + WeightedWindowWidth:
print('With the --auto-window-params option you specified a start',
'point of', WindowStartPos, 'and your weighted window width was',
str(WeightedWindowWidth) + '; one or both of these values should be',
'decreased because the length of your reference or your',
'specified end point is only', str(WindowEndPosLocal) + '. We need to be',
'able to fit at least one window in between the start and end. Quitting.',
file=sys.stderr)
exit(1)
WindowCoords = []
NextStart = WindowStartPos
NextEnd = WindowStartPos + WeightedWindowWidth - 1
while NextEnd <= WindowEndPosLocal:
WindowCoords += [NextStart, NextEnd]
NextStart = NextEnd - WindowOverlap + 1
NextEnd = NextStart + WeightedWindowWidth - 1
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
return WindowCoords, UserCoords, NumCoords, WindowEndPosLocal
# If there is only one bam and no other refs, no coordinate translation
# is necessary - we use the coords as they are, though setting any after the end
# of the reference to be equal to the end of the reference.
if NumberOfBams == 1 and not IncludeOtherRefs:
if args.align_refs_only:
print('As you are supplying a single bam file and no external references,',
"the --align-refs-only option makes no sense - there's nothing to align.",
"Quitting.", file=sys.stderr)
exit(1)
RefSeqLength = len(RefSeqs[0])
if AutoWindows:
WindowCoords, UserCoords, NumCoords, WindowEndPos = \
SimpleCoordsFromAutoParams(RefSeqLength)
if ExploreWindowWidths:
MaxCoordForWindowWidthTesting = RefSeqLength
WindowCoords = FindExploratoryWindows(MaxCoordForWindowWidthTesting)
NumCoords = len(WindowCoords)
UserCoords = WindowCoords
CoordsInRefs = {BamAliases[0] : WindowCoords}
# If there are at least two bam files, or if there is one but we're including
# other refs, we'll be aligning references and translating the user-specified
# coords with respect to each sequence, then storing those coords in a dict
# indexed by the ref's name.
else:
if args.verbose:
print('Now determining the correspondence between coordinates in different',
'bam files.')
# If we're separately and sequentially pairwise aligning our references to
# a chosen ref in order to determine window coordinates, do so now.
if PairwiseAlign:
# Get the coords if auto coords
if AutoWindows:
WindowCoords, UserCoords, NumCoords, WindowEndPos = \
SimpleCoordsFromAutoParams(RefForPairwiseAlnsLength)
# Find the coordinates with respect to the chosen ref, in the alignment of
# just the external refs - we'll need these later.
ExternalRefWindowCoords = \
pf.TranslateSeqCoordsToAlnCoords(RefForPairwiseAlnsGappySeq, WindowCoords)
CoordsInRefs = {}
for BamRefSeq in RefSeqs:
# Align
SeqIO.write([RefForPairwiseAlns,BamRefSeq], TempFileForPairwiseUnalignedRefs,
"fasta")
TempFiles.add(TempFileForPairwiseUnalignedRefs)
with open(TempFileForPairwiseAlignedRefs, 'w') as f:
try:
ExitStatus = subprocess.call(Mafft2ArgList + ['--quiet',
'--preservecase', TempFileForPairwiseUnalignedRefs], stdout=f)
assert ExitStatus == 0
except:
print('Problem calling mafft. Quitting.', file=sys.stderr)
raise
TempFiles.add(TempFileForPairwiseAlignedRefs)
# Translate.
# The index names in the PairwiseCoordsDict, labelling the coords found by
# coord translation, should coincide with the two seqs we're considering.
PairwiseCoordsDict = TranslateCoords([TempFileForPairwiseAlignedRefs,
args.pairwise_align_to] + [str(coord) for coord in WindowCoords])
if set(PairwiseCoordsDict.keys()) != \
set([BamRefSeq.id,args.pairwise_align_to]):
print('Malfunction of phyloscanner: mismatch between the sequences',
'found in the output of', TranslateCoordsCode, 'and the two names "' + \
BamRefSeq.id+'", "'+args.pairwise_align_to +'". Quitting.',
file=sys.stderr)
exit(1)
CoordsInRefs[BamRefSeq.id] = PairwiseCoordsDict[BamRefSeq.id]
# We're creating a global alignment of all references:
else:
# Put all the mapping reference sequences into one file. If an alignment of
# other references was supplied, add the mapping references to that
# alignment; if not, align the mapping references to each other.
SeqIO.write(RefSeqs, TempFileForRefs, "fasta")
TempFiles.add(TempFileForRefs)
if IncludeOtherRefs:
FinalMafftOptions = ['--add', TempFileForRefs, args.alignment_of_other_refs]
else:
FinalMafftOptions = [TempFileForRefs]
with open(FileForAlignedRefs, 'w') as f:
try:
ExitStatus = subprocess.call(MafftArgList + ['--quiet',
'--preservecase'] + FinalMafftOptions, stdout=f)
assert ExitStatus == 0
except:
print('Problem calling mafft. Quitting.', file=sys.stderr)
raise
if args.align_refs_only:
if PrintInfo:
print('References aligned in', FileForAlignedRefs+ \