-
Notifications
You must be signed in to change notification settings - Fork 0
/
aceseq2seg.pl
153 lines (123 loc) · 4.25 KB
/
aceseq2seg.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env perl
###############
## LIBRARIES ##
###############
use strict;
use warnings;
use Getopt::Long;
use Scalar::Util;
###############
## USAGE ##
###############
my $usage = "
perl $0
\t-f [comma separated input file list - REQUIRED]
\t-n [comma separated sample names - DEFAULT: input file list]
\t-l [log base, either 2 or 'given' - DEFAULT: given]
\t-o [output file - DEFAULT: STDOUT]
\n";
###############
## PARAMS ##
###############
## INPUT variables
my @files = (); # list of files
my @names = (); # list of sample names [optional]
my $log_base="given"; # GISTIC requires input to be in log space, expecting log-base-2 but log-base-ploidy is appropiate for non diploids [optional]
my $output = "STDOUT";
## Parameters
my $homo_del = -5;
my $homo_del_threshold = 0.3;
my $snp_threshold = 5;
my $chr_idx = 0;
my $str_idx = 1,
my $end_idx = 2;
my $tcn_idx = 5;
my $snp_idx = 13;
################
## PARSE OPTS ##
################
GetOptions ("files=s" => \@files,
"names=s" => \@names,
"logBase=s" => \$log_base,
"output=s" => \$output)
or die("ERROR: error in command line arguments\n$usage");
## CHECK @files and @names
die("\nERROR: file list too small (\"$files[0]\")\n$usage") if (length($files[0])<1);
@files = split(/,/,join(',',@files));
if (length($names[0])>1){
# use names if defined
@names = split(/,/,join(',',@names));
} else {
# else use filenames as names
@names = @files;
}
my $num_files = scalar @files;
my $num_names = scalar @names;
die("\nERROR: number of names ($num_names) does not match number of files ($num_files). Please check input parameters\n$usage") if ($num_files ne $num_names);
warn "\nNumber of input files: $num_files\n";
warn "SAMPLE\tFILE\n";
for (0..($num_files-1)){
warn "$names[$_]\t$files[$_]\n";
die("\nERROR: file '$files[$_]' not found\n$usage") unless (-e $files[$_]);
}
## CHECK log base for conv ##
if (Scalar::Util::looks_like_number($log_base)){
warn "\nLog base '$log_base' looks like a number\n";
} else {
warn "\nLog base '$log_base' DOES NOT look like a number... forcing to 'given'\n";
$log_base = "given";
}
die("\nERROR: log base ($log_base) cannot be less than 1\n$usage") if (($log_base < 1) && ($log_base ne "given"));
warn "\nLog base to use: '$log_base' (should be 2 or 'given'. Other values used at your own risk!)\n\n";
#####################
## PROCESS SAMPLES ##
#####################
## EXPECTED FILE HEADER
#tcc:1
#ploidy:3.94
#roundPloidy:4
#fullPloidy:4
#quality:0.99834736077127
#assumed sex:male
#0:chromosome 1:start 2:end 3:length 4:covRatio 5:TCN 6:SV.Type 7:c1Mean 8:c2Mean 9:dhEst 10:dhSNPs 11:genotype 12:CNA.type 13:NbrOfHetsSNPs 14:minStart 15:maxStart 16:minStop 17:maxStop
print "sample\tchromosome\tstart_position\tend_position\tnum_markers\tlog2_seg_CN-1\n";
foreach my $sample_idx (0..($num_files-1)){
my $file = $files[$sample_idx];
my $name = $names[$sample_idx];
my $sex = "female";
my $fh;
open $fh, $file or die "ERROR: could not open $file : $!\n$usage";
my $round_ploidy;
my $line;
while($line = <$fh>) {
if ($line =~ m/\#roundPloidy:(.*?)$/){
$round_ploidy = $1;
chomp($round_ploidy);
warn "$name\tploidy:$round_ploidy\tlog_base:$log_base\n";
# reset round_ploidy to defined log_base, unless sample sepcific ploidy are the log_base
$round_ploidy = $log_base unless ($log_base eq "given")
}
if ($line =~ m/\#assumed sex:(.*?)$/){
$sex = $1;
}
unless ($line =~ m/^\#/){
my @line_split = split('\t', $line);
my ($chr, $str, $end, $tcn, $num_snps) = ($line_split[$chr_idx], $line_split[$str_idx], $line_split[$end_idx], $line_split[$tcn_idx], $line_split[$snp_idx]);
# covert TCN into log space, or set to base_ploidy for low snps, or low for homodel
my $tcn_log = 0;
if ($num_snps < $snp_threshold){
$tcn_log = 0;
}
elsif ($tcn < $homo_del_threshold ) {
$tcn_log = $homo_del;
}
else {
# rescale log change to reflect changes in log2 space
$tcn_log = log( log($round_ploidy**($tcn/$round_ploidy))/log($round_ploidy) *2) / log(2) - 1;
}
$tcn_log = (int($tcn_log*1000))/1000;
print "$name\t$chr\t$str\t$end\t$num_snps\t$tcn_log\n";
}
}
close $fh;
}