-
Notifications
You must be signed in to change notification settings - Fork 6
/
find_ORFs.php
executable file
·676 lines (613 loc) · 33.1 KB
/
find_ORFs.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
#!/usr/bin/php
<?php
/*******************************************************************************
*
* ORF-Finder analyzes the Mutalyzer results, compared to the original wiggle
* files, checks the strand of the transcript in a third file, and finds ORFs
* that have not been annotated before.
*
* Created : 2013-07-12
* Modified : 2014-10-08
* Version : 0.93
*
* Copyright : 2013-2015 Leiden University Medical Center; http://www.LUMC.nl/
* Programmer : Ing. Ivo F.A.C. Fokkema <I.F.A.C.Fokkema@LUMC.nl>
*
* Changelog : 0.5 2013-11-21
* New algorithm to detect ORF start sites.
* Archived 0.4 (2013-10-10)
* 0.6 2014-01-14
* Changed mentions of TSS to TIS, so that the Translation
* Initiation Site is not confused with Transcription Start Site.
* 0.7 2014-01-27
* It now logs the current settings into the stats file, so the
* settings can be verified when output results differ.
* It now reports the transcripts including the version, so that
* the mappings and sequence can be verified.
* 0.8 2014-07-14
* Fixed problem with parsing the Wiggle files; we were skipping
* the chrom=NC_* header, adding its positions to the last used
* chromosome (usually chrY).
* 0.9 2014-08-01
* Peaks should not be called when they do not show the highest
* coverage of the codon.
* 0.91 2014-08-01
* It sometimes happens that a gene is mapped to two different
* chromosomes (for instance NM_133362.2 (Erdr1)). The current
* data structure only allows for one chromosome per gene, which
* results in errors while analyzing the chromosome mentioned
* second in the Mutalyzer file, because the locations are linked
* to the wrong chromosome and therefore may not be present in the
* Wiggle file. We now report these cases.
* 0.92 2014-08-15
* Total coverage for all positions to be analyzed is calculated
* (equals the number of reads analyzed) before the peak calling,
* and added to the statistics file, to be able to verify if a
* lower number of peaks found is correlated with a lower number
* of reads.
* 0.93 2014-10-08
* Fixed bug; Positions after the cutoff on multiple transcripts
* were counted multiple times, which could cause the gene header
* in the normal results file to not be printed.
* ---- 2015-01-27
* With new default settings, aimed at using merged biological
* replicates.
*
*
* This work is licensed under the Creative Commons
* Attribution-NonCommercial-ShareAlike 4.0 International License. To view a
* copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
* or send a letter to:
* Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
*
*************/
$_SETT =
array(
'version' => '0.93',
'min_coverage' => 3, // Positions with less coverage than this are ignored. NOTE: The Mutalyzer batch file has already been filtered for coverage lower than 3.
'max_upstream' => 500, // Maximum distance from known CDS look for ORFs.
'max_downstream' => 500, // Maximum distance from known CDS look for ORFs.
'peak_finding' =>
array(
'min_coverage' => 20, // Candidate peaks need to have at least a coverage of 20 (for 3 merged biological replicates, 10 for single samples).
'codon_1st_pos_min_coverage_fraction' => 0.6, // The first position of a codon must have at least 60% of all coverage in that codon.
'upstream_codons_to_check' => 5, // How many codons upstream do we check to verify a candidate peak? (coverage should be higher than the max coverage in those codons)
'downstream_codons_to_check' => 5, // How many codons downstream do we check to verify a candidate peak? (coverage should be higher than the max coverage in those codons)
'downstream_codons_max_coverage_max_fraction' => 0.1, // Downstream codons should not have a maximum coverage higher than 10% of the candidate peak.
'candidate_coverage_min_fraction' => 0.1, // Each candidate peak must have at least 10% coverage relative to the highest candidate.
'candidate_min_distance_reported_separately' => 5000, // At what distance from the pORF start site should we report candidate peaks separately?
),
'output_suffix' =>
array(
'stats' => '.ORF_analysis_results_stats.txt',
'results' => '.ORF_analysis_results.txt',
'results_after_cutoff' => '.ORF_analysis_results_after_cutoff.txt',
),
);
echo 'ORF-Finder v.' . $_SETT['version'] . "\n";
$aFiles = $_SERVER['argv'];
$sScriptName = array_shift($aFiles);
if (count($aFiles) != 4) {
die('Usage: ' . $sScriptName . ' MUTALYZER_FILE WIGGLE_FILE GENE_LIST_FILE STRAND' . "\n\n");
}
$sArgStrand = array_pop($aFiles);
if ($sArgStrand == 'F') {
$sArgStrand = '+';
} elseif ($sArgStrand == 'R') {
$sArgStrand = '-';
} elseif (!in_array($sArgStrand, array('+','-'))) {
die('Strand argument is invalid. Please choose from +, -, F or R.' . "\n\n");
}
// Check if all files can be read.
foreach ($aFiles as $sFile) {
if (!is_readable($sFile)) {
die('Unable to open ' . $sFile . '.' . "\n");
}
}
// Checking if we are allowed to create the output files.
$aFilesOut = array();
foreach ($_SETT['output_suffix'] as $sType => $sSuffix) {
$sFileOut = $aFiles[1] . $sSuffix;
if (file_exists($sFileOut)) {
if (!is_writable($sFileOut)) {
die('Can not overwrite ' . $sFileOut . ', aborting.' . "\n");
}
} elseif (!is_writable(dirname($sFileOut))) {
die('Can not create ' . $sFileOut . ', aborting.' . "\n");
}
$fOut = @fopen($sFileOut, 'w');
if (!$fOut) {
die('Unable to open file for writing: ' . $sFileOut . '.' . "\n\n");
}
$aFilesOut[$sType] = array('name' => $sFileOut, 'handler' => $fOut);
}
// Write settings to Stats file.
// Ugly looking code, but whatever.
foreach ($_SETT as $key => $val) {
if (is_array($val)) {
fputs($aFilesOut['stats']['handler'], $key . ':' . "\n");
foreach ($val as $key2 => $val2) {
fputs($aFilesOut['stats']['handler'], ' ' . $key2 . ':' . $val2 . "\n");
}
} else {
fputs($aFilesOut['stats']['handler'], $key . ':' . $val . "\n");
}
}
fputs($aFilesOut['stats']['handler'], "\n");
function sortPositions ($key1, $key2) {
// Sorts the positions.
if ($key1{0} == '*' && $key2{0} == '*') {
return (substr($key1, 1) < substr($key2, 1)? -1 : 1); // I know the values will never be equal.
} elseif ($key1{0} == '*' && $key2{0} != '*') {
return 1;
} elseif ($key1{0} != '*' && $key2{0} == '*') {
return -1;
} else {
return ($key1 < $key2? -1 : 1); // I know the values will never be equal.
}
}
// Prepare wiggle file, read into memory.
$aWiggleFile = file($aFiles[1], FILE_IGNORE_NEW_LINES);
$aCoverages = array(); // chr => array(position => coverage, ...)
$nChroms = 0;
$sChrom = '';
print('Parsing Wiggle file... ');
foreach ($aWiggleFile as $sLine) {
if (preg_match('/^variableStep chrom=(.+)$/', $sLine, $aRegs)) {
// Chromosome found.
// if (!preg_match('/^chr([0-9]+|[XYM])$/', $aRegs[1])) { // FOR NOW, IGNORE chrM!!!
if (!preg_match('/^chr([0-9]+|[XY])$/', $aRegs[1])) {
$sChrom = '';
} else {
$sChrom = $aRegs[1];
$nChroms ++;
$aCoverages[$sChrom] = array();
}
continue;
}
if ($sChrom) {
list($nPos, $nCoverage) = explode("\t", $sLine);
$aCoverages[$sChrom][$nPos] = (int) $nCoverage;
}
}
print('done.' . "\n");
fputs($aFilesOut['stats']['handler'], 'Loaded ' . $nChroms . ' chromosomes with ' . (count($aCoverages, true) - $nChroms) . ' positions in memory.' . "\n");
// Prepare gene file.
$aTranscriptFile = file($aFiles[2], FILE_IGNORE_NEW_LINES);
unset($aTranscriptFile[0]); // Header.
$aTranscripts = array();
print('Parsing gene file... ');
foreach ($aTranscriptFile as $sLine) {
if (!trim($sLine) || $sLine{0} == '#') {
continue;
}
list($sTranscript, $sStrand, $sGene) = explode("\t", $sLine);
$aTranscripts[$sTranscript] = array($sGene, $sStrand);
}
print('done.' . "\n");
fputs($aFilesOut['stats']['handler'], 'Loaded ' . count($aTranscripts) . ' transcripts in memory.' . "\n");
// Now, loop the Mutalyzer file, find coverages in array (if not present, complain), check strand in gene list, and summarize results.
$nFiltered = 0;
$nIntronicPositions = 0;
$nUnmappable = 0;
$nIntergenic = 0;
$aPositionsPerGene = array(); // Will contain genes as keys, with an array of positions, which is an array of positions per transcript and their mappings to the genome.
$aTranscriptsPerPosition = array(); // Stores how many positions have how many transcripts, pure statistics.
$aCodonPositions = array();
$aUnknownTranscripts = array();
$aMutalyzerResults = file($aFiles[0]);
unset($aMutalyzerResults[0]); // Header.
print('Parsing mutalyzer results file... ');
foreach ($aMutalyzerResults as $sLine) {
$aLine = explode("\t", rtrim($sLine)); // Removing whitespace from the right.
$sVariant = array_shift($aLine);
list($sChr, $nPosition) = explode(';', preg_replace('/^(chr(?:[0-9]+|[XYM])):g\.(\d+)del$/', "$1;$2", $sVariant));
// Get coverage.
if (!isset($aCoverages[$sChr][$nPosition])) {
die("\n" .
'Cannot find coverage for position ' . $sVariant . ', probably you selected the wrong Wiggle file for this Mutalyzer file?' . "\n");
}
// Filter for low coverage.
if ($aCoverages[$sChr][$nPosition] < $_SETT['min_coverage']) {
// NOTE: The Mutalyzer batch file has already been filtered for coverage lower than 3.
$nFiltered ++;
continue;
}
// We need at least 4 values; input, errors, chrom_var, var_on_transcript.
$nTranscripts = count($aLine) - 2;
if ($nTranscripts < 0) {
$nTranscripts = 0;
}
// Prevent notices...
if (!isset($aTranscriptsPerPosition[$nTranscripts])) {
$aTranscriptsPerPosition[$nTranscripts] = 0;
}
$aTranscriptsPerPosition[$nTranscripts] ++;
if ($nTranscripts >= 1) {
$sError = array_shift($aLine);
if ($sError) {
// If we have an error, would we ever have an array of at least 4? I think not...
die("\n" .
'Position ' . $sVariant . ' somehow generated an error: ' . $sError . "\n");
}
array_shift($aLine); // We're ignoring the mapping on the chromosome, which was our input anyways.
// What is left is an array with at least one mapping to a transcript.
$b5UTR = false;
$bCoding = false; // 'Coding region' also applies to -15 to -1.
$bIntronic = false;
$b3UTR = false;
// Store all options first, then decide what to do depending on the resulting options.
$aCodonOptions = array();
foreach ($aLine as $sVOT) {
if (preg_match('/^([NX]R_\d+)/', $sVOT)) {
// Non-coding RNA... ignore it, even though it probably doesn't do much.
continue;
} elseif (!preg_match('/^(([NX][RM]_\d+)\.\d+):(.+)/', $sVOT, $aRegs)) {
die("\n" .
'Cannot parse variant ' . $sVOT . "\n");
}
$sTranscriptWithVersion = $aRegs[1];
$sTranscript = $aRegs[2];
$sPosition = $aRegs[3];
// Check if we have info on the transcript, then check strand and store positions!
if (!isset($aTranscripts[$sTranscript])) {
// This happens quite a lot...
if (!in_array($sTranscript, $aUnknownTranscripts)) {
$aUnknownTranscripts[] = $sTranscript;
}
continue; // On to the next VOT for this position.
}
if ($sArgStrand == $aTranscripts[$sTranscript][1]) {
// Correct strand!
if (preg_match('/^[cn]\.(\*)?(\-?\d+)del$/', $sPosition, $aRegs)) {
// Got one!
$aCodonOptions[$sTranscriptWithVersion] = $aRegs[1] . $aRegs[2];
if ($aRegs[1]) {
// 3'UTR!
$b3UTR = true;
} elseif ($aRegs[2] < -15) {
$b5UTR = true; // Other values >= -15 && < 0 are very valuable and not regarded 5'UTR.
} else {
$bCoding = true;
}
} elseif (preg_match('/^[cn]\.(\*)?(\-?\d+)([+-]\d+)del$/', $sPosition)) {
// Intronic, we must have got the wrong transcript here!
$bIntronic = true;
} else {
// Euh....
die($sPosition);
}
}
}
// Now, decide, based on the options, what to do.
// If we have no options, disregard position.
if (!count($aCodonOptions)) {
// Has not been mapped to any transcript on the correct strand!
if ($bIntronic) {
// We did find intronic mappings...
$nIntronicPositions ++;
} else {
// Nothing on this strand...
$nUnmappable ++;
}
continue; // Continue to next position.
}
// Now filter distances, more than 500bp up- or downstream is too much.
foreach ($aCodonOptions as $sTranscript => $sPosition) {
if ($sPosition{0} == '-' && $sPosition < -$_SETT['max_upstream']) {
unset($aCodonOptions[$sTranscript]);
} elseif ($sPosition{0} == '*' && substr($sPosition, 1) > $_SETT['max_downstream']) {
unset($aCodonOptions[$sTranscript]);
}
}
// Check if we filtered it out completely now.
if (!count($aCodonOptions)) {
// Too far from any known transcript!
$nIntergenic ++;
continue; // Continue to next position.
}
// Check if the 5' and 3' flags are still correct.
$b5UTR = false;
$b3UTR = false;
foreach ($aCodonOptions as $sPosition) {
if ($sPosition{0} == '-' && $sPosition < -15) {
$b5UTR = true;
} elseif ($sPosition{0} == '*') {
$b3UTR = true;
}
}
// If we have 3'UTR positions, but also 5' UTR or coding region positions, remove the 3' positions.
if ($b3UTR && ($b5UTR || $bCoding)) {
foreach ($aCodonOptions as $sTranscript => $sPosition) {
if ($sPosition{0} == '*') {
unset($aCodonOptions[$sTranscript]);
}
}
$b3UTR = false;
}
// Now that we're done filtering, save all positions per gene, so we can loop through it and try and find patterns.
foreach ($aCodonOptions as $sTranscriptWithVersion => $sPosition) {
$sTranscript = substr($sTranscriptWithVersion, 0, strpos($sTranscriptWithVersion, '.'));
list($sGene, $sStrand) = $aTranscripts[$sTranscript];
// Create gene array if it doesn't exist.
if (!isset($aPositionsPerGene[$sGene])) {
$aPositionsPerGene[$sGene] = array('chr' => $sChr, 'strand' => $sStrand, 'positions' => array(), 'unique_positions' => array(), 'unique_positions_analyzed' => array());
}
// If we don't know this transcript yet, add it to the list.
if (!isset($aPositionsPerGene[$sGene]['positions'][$sTranscriptWithVersion])) {
$aPositionsPerGene[$sGene]['positions'][$sTranscriptWithVersion] = array();
}
// Store position on transcript.
$aPositionsPerGene[$sGene]['positions'][$sTranscriptWithVersion][$sPosition] = $nPosition;
}
} elseif (count($aLine) == 1) {
// Generated an error!
$sError = $aLine[0];
die('Position ' . $sVariant . ' somehow generated an error: ' . $sError . "\n");
} else {
// Unmappable...
$nUnmappable ++;
}
}
$nGenesMapped = count($aPositionsPerGene);
print('done.' . "\n");
fputs($aFilesOut['stats']['handler'], 'Loaded ' . $nGenesMapped . ' genes with mappings.' . "\n");
sort($aUnknownTranscripts);
fputs($aFilesOut['stats']['handler'], 'Total positions:' . "\t" . count($aMutalyzerResults) . "\n" .
'Filtered for low coverage:' . "\t" . $nFiltered . "\n" .
'Transcripts not found in gene file:' . "\t" . count($aUnknownTranscripts) . "\t" . implode(';', $aUnknownTranscripts) . "\n" .
'Transcripts per position:' . "\n" .
'Transcripts' . "\t" . 'Count' . "\n");
ksort($aTranscriptsPerPosition);
foreach($aTranscriptsPerPosition as $nTranscripts => $nPositions) {
fputs($aFilesOut['stats']['handler'], $nTranscripts . "\t" . $nPositions . "\n");
}
fputs($aFilesOut['stats']['handler'], "\n" .
'Positions left after filtering:' . "\t" . array_sum($aTranscriptsPerPosition) . "\n" .
'Positions not mappable:' . "\t" . $nUnmappable . "\t" . 'Possible causes: no mapping by Mutalyzer, transcript is missing, or strand is wrong' . "\n" .
'Positions intronic only:' . "\t" . $nIntronicPositions . "\t" . 'Possible causes: transcript is missing, or strand is wrong' . "\n" .
'Positions too far from known genes:' . "\t" . $nIntergenic . "\t" . 'Possible causes: transcript is missing, newly discovered transcript, or strand is wrong' . "\n" .
'Positions left:' . "\t" . (array_sum($aTranscriptsPerPosition) - $nUnmappable - $nIntronicPositions - $nIntergenic) . "\n");
print('Now looking for peaks ');
// Now we'll look for candidate peaks. We'll start walking through the transcript, starting from 5' to 3', looking for peaks.
$nGenesDiscarded = 0;
$i = 0;
$aResults = array(); // Here we will store the list of found peaks per gene.
$nTotalCoverageForAllPositions = 0; // Sums up the total coverage for all positions left in $aPositionsPerGene, for the stats.
// FOR DEBUGGING PURPOSES, UNCOMMENT THE LINE BELOW AND CONSTRUCT THE ARRAY USING THE GENE(S) YOU WISH TO DEBUG.
//$aPositionsPerGene = array('Son' => $aPositionsPerGene['Son']);
foreach ($aPositionsPerGene as $sGene => $aGene) {
if (!(++$i%25)) {
print('.');
}
foreach ($aGene['positions'] as $sTranscript => $aPositions) {
if ($aGene['strand'] == '-' && count($aPositions) > 1) {
uksort($aPositions, 'sortPositions');
}
// Store positions, for statistics.
$aPositionsPerGene[$sGene]['unique_positions'] = array_merge($aPositionsPerGene[$sGene]['unique_positions'], array_values($aPositions));
foreach ($aPositions as $sPosition => $nPosition) {
if (!isset($aCoverages[$aGene['chr']][$nPosition])) {
// This occurs, of the position is in $aMutalyzerResults, but not in $aWiggleFile, for instance if a gene
// is located on more than one chromosome (like Erdr1). We already verified in the first loop, that each
// position in the Mutalyzer file has a coverage. But our data structure allows only one chromosome per
// gene, so the chromosome first mentioned in the Mutalyzer file, is stored.
// We choose to report it, because the user may want to know, but on the other hand there's nothing we
// can do. Analysis on the chromosome mentioned second in the Mutalyzer file, will not work.
print("\n" .
'Position not found, ' . $aGene['chr'] . ':' . $nPosition . ' has no coverage; ' . $sTranscript . ' (' . $sGene . ') found on two chromosomes maybe?');
// Prevent further notices.
$aCoverages[$aGene['chr']][$nPosition] = 0;
continue;
}
if ($aCoverages[$aGene['chr']][$nPosition] < $_SETT['peak_finding']['min_coverage']) {
// Coverage of this peak is not high enough for analysis.
continue;
}
$b3UTR = (substr($sPosition, 0, 1) == '*'); // There should not be a lot of these.
$b5UTR = (substr($sPosition, 0, 1) == '-');
// Store position, for statistics.
$aPositionsPerGene[$sGene]['unique_positions_analyzed'][] = $nPosition;
// Step 1: Its coverage should be higher than that of the positions
// located 3, 6, 9, 12 and 15 nucleotides upstream of it.
for ($j = 1; $j <= $_SETT['peak_finding']['upstream_codons_to_check']; $j++) {
if ($b3UTR) {
// FIXME: See note in README about this not working when crossing the stop codon position.
$sPositionToCheck = '*' . (substr($sPosition, 1) - (3*$j));
} else {
$sPositionToCheck = $sPosition - (3*$j);
if (!$b5UTR && $sPositionToCheck <= 0) {
// We crossed the -1/1 border.
$sPositionToCheck--;
}
}
$nCoverage = (!isset($aPositions[$sPositionToCheck])? 0 : $aCoverages[$aGene['chr']][$aPositions[$sPositionToCheck]]);
if ($aCoverages[$aGene['chr']][$nPosition] <= $nCoverage) {
// Coverage is NOT higher than coverage of previous position.
continue 2;
}
}
// Step 2a: It should show a triplet periodicity...
$nMaxCoverageOfCodon = $nCoverageOfCodon = $aCoverages[$aGene['chr']][$nPosition];
// Firstly, this position should be the highest of the codon.
// The problem is, that when the codon_1st_pos_min_coverage_fraction factor is set below 0.5,
// that we could in theory call a peak that has the second highest coverage of the codon.
// Also, the two other nucleotides in the possible codon, so the two nucleotides downstream of the position
// we're analyzing, should not have an average coverage higher than 20% (default setting) of the position's coverage.
for ($j = 1; $j <= 2; $j++) {
if ($b3UTR) {
$sPositionToCheck = '*' . ((int) substr($sPosition, 1) + $j);
} else {
$sPositionToCheck = $sPosition + $j;
if ($b5UTR && $sPositionToCheck >= 0) {
// We crossed the -1/1 border.
$sPositionToCheck++;
}
}
$nCoverage = (!isset($aPositions[$sPositionToCheck])? 0 : $aCoverages[$aGene['chr']][$aPositions[$sPositionToCheck]]);
$nMaxCoverageOfCodon = max($nMaxCoverageOfCodon, $nCoverage);
$nCoverageOfCodon += $nCoverage;
}
if ($aCoverages[$aGene['chr']][$nPosition] != $nMaxCoverageOfCodon) {
// Coverage is NOT the highest coverage of this codon.
continue;
}
if (($aCoverages[$aGene['chr']][$nPosition]/$nCoverageOfCodon) < $_SETT['peak_finding']['codon_1st_pos_min_coverage_fraction']) {
// Coverage is NOT at least the minimum required percentage of this codon's coverage.
continue;
}
// Step 2b: ... and a clear "harringtonine pattern".
// The following 5 codons should also not have a higher maximum coverage than this position's coverage.
// If any of the following 5 codons have a maximum coverage higher than 10% of the coverage of the position we're analyzing,
// that codon must not show a conflicting triplet periodicity pattern (see 'codon_1st_pos_min_coverage_fraction' check above).
for ($j = 1; $j <= $_SETT['peak_finding']['downstream_codons_to_check']; $j++) {
$nCoverageOfCodon = 0;
$nMaxCoverageOfCodon = 0;
$sPositionFirstOfCodon = '';
for ($k = 0; $k <= 2; $k++) {
if ($b3UTR) {
$sPositionToCheck = '*' . ((int) substr($sPosition, 1) + (3*$j) + $k);
} else {
$sPositionToCheck = $sPosition + (3*$j) + $k;
if ($b5UTR && $sPositionToCheck >= 0) {
// We crossed the -1/1 border.
$sPositionToCheck++;
}
}
if (!$k) {
$sPositionFirstOfCodon = $sPositionToCheck;
}
$nCoverage = (!isset($aPositions[$sPositionToCheck])? 0 : $aCoverages[$aGene['chr']][$aPositions[$sPositionToCheck]]);
$nCoverageOfCodon += $nCoverage;
$nMaxCoverageOfCodon = max($nMaxCoverageOfCodon, $nCoverage);
}
if ($nMaxCoverageOfCodon > $aCoverages[$aGene['chr']][$nPosition]) {
// Max coverage of downstream codon IS higher.
continue 2;
} elseif ($nMaxCoverageOfCodon > ($_SETT['peak_finding']['downstream_codons_max_coverage_max_fraction']*$aCoverages[$aGene['chr']][$nPosition])) {
// Max coverage of downstream codon is higher than 10% of the current codon. This codon should now NOT have a conflicting periodicity.
$nCoverage = (!isset($aPositions[$sPositionFirstOfCodon])? 0 : $aCoverages[$aGene['chr']][$aPositions[$sPositionFirstOfCodon]]);
if ($nCoverage != $nMaxCoverageOfCodon) {
// Coverage of first position is NOT the highest coverage of this codon, codon does NOT show correct triplet periodicity.
continue 2;
}
if (($nCoverage/$nCoverageOfCodon) < $_SETT['peak_finding']['codon_1st_pos_min_coverage_fraction']) {
// Coverage of first position is NOT at least the minimum required percentage of this codon's coverage, codon does NOT show correct triplet periodicity.
continue 2;
}
}
}
// This position has passed all tests and will be reported.
if (!isset($aResults[$sGene])) {
$aResults[$sGene] = array('report_separately' => 0, 'positions' => array());
}
if (!isset($aResults[$sGene]['positions'][$nPosition])) {
$aResults[$sGene]['positions'][$nPosition] = array();
}
$aResults[$sGene]['positions'][$nPosition][$sTranscript] = $sPosition;
// Mark as potential false positive when too far downstream, but not in the 3'UTR,
// because of the background in the coding region >= 5KB from the pORF start site.
if (!$b5UTR && !$b3UTR && $sPosition >= $_SETT['peak_finding']['candidate_min_distance_reported_separately']) {
// 2014-10-08; 0.93; Only mark if mark is not already set for a different transcript; otherwise there's a risk of the gene header not being printed.
if (empty($aResults[$sGene]['positions'][$nPosition]['report_separately'])) {
$aResults[$sGene]['report_separately'] ++;
$aResults[$sGene]['positions'][$nPosition]['report_separately'] = true;
}
}
// Continue 5 codons downstream.
// FIXME: How to nicely skip the next 5 codons?
}
}
$aPositionsPerGene[$sGene]['unique_positions'] = array_unique($aPositionsPerGene[$sGene]['unique_positions']);
$aPositionsPerGene[$sGene]['unique_positions_analyzed'] = array_unique($aPositionsPerGene[$sGene]['unique_positions_analyzed']);
// 2014-08-15; 0.92; We calculate the total coverage that is represented by the positions that were left before the peak calling.
// Since we need the list of unique positions, we need to calculate it here, after the peak calling.
foreach ($aPositionsPerGene[$sGene]['unique_positions'] as $nPosition) {
$nTotalCoverageForAllPositions += $aCoverages[$aGene['chr']][$nPosition];
}
// Step 3: Per gene, from all its found possible ORF starts, we will take the one with the highest coverage as a reference, and discard any
// other candidate ORF starting points that do not have at least a coverage (on that position) of 10% of the reference (highest candidate).
if (!isset($aResults[$sGene])) {
continue;
}
$nMaxCoverage = 0;
foreach ($aResults[$sGene]['positions'] as $nPosition => $aPosition) {
$nMaxCoverage = max($nMaxCoverage, $aCoverages[$aGene['chr']][$nPosition]);
}
foreach ($aResults[$sGene]['positions'] as $nPosition => $aPosition) {
if ($aCoverages[$aGene['chr']][$nPosition] < $_SETT['peak_finding']['candidate_coverage_min_fraction']*$nMaxCoverage) {
// Coverage is not enough to be reported!
if (isset($aResults[$sGene]['positions'][$nPosition]['report_separately'])) {
$aResults[$sGene]['report_separately'] --;
}
unset($aResults[$sGene]['positions'][$nPosition]);
}
}
// Step 4: Genes that have no candidate peaks left, are ignored.
if (!count($aResults[$sGene]['positions'])) {
unset($aResults[$sGene]);
}
}
$nGenesLeft = count($aResults);
$nGenesDiscarded = $nGenesMapped - $nGenesLeft;
print(' done.' . "\n");
fputs($aFilesOut['stats']['handler'],
'Number of reads left:' . "\t" . $nTotalCoverageForAllPositions . "\n" .
'Genes mapped to:' . "\t" . $nGenesMapped . "\n" .
'Genes discarded, no peaks found:' . "\t" . $nGenesDiscarded . "\n" .
'Genes left with found translation start sites:' . "\t" . $nGenesLeft . "\n");
////////////////////////////////////////////////////////////////////////////////
/*
+ Problem: Mutalyzer describes positions after the stop codon with an asterisk
followed by the distance to the stop codon, e.g. *3. Since this script does
not know the transcript lengths at the moment, it is unaware of the distance
between a position in the coding region (e.g. 4623) and a position after the
stop codon. Therefore, the checks of the coverage of the surrounding positions
of a candidate translation start site will be incorrect or not be possible
when this site is located around the annotated stop codon.
This may cause false positives around the translation stop site, but not false
negatives.
*/
fputs($aFilesOut['results_after_cutoff']['handler'], 'The following results are found in the coding region, at a distance to the TIS of at least ' . $_SETT['peak_finding']['candidate_min_distance_reported_separately'] . ' nucleotides.' . "\n\n");
foreach ($aResults as $sGene => $aGene) {
$nOriCandidates = count($aPositionsPerGene[$sGene]['unique_positions']);
$nCandidatesAnalyzed = count($aPositionsPerGene[$sGene]['unique_positions_analyzed']);
$nResults = count($aResults[$sGene]['positions']);
$aTranscripts = array_keys($aPositionsPerGene[$sGene]['positions']);
if ($aGene['report_separately']) {
// Gene has at least one position that needs to be reported separately.
fputs($aFilesOut['results_after_cutoff']['handler'], $sGene . "\t" . 'Positions found:' . "\t" . $nOriCandidates . "\t" . 'Positions analyzed:' . "\t" . $nCandidatesAnalyzed . "\t" . 'TIS found:' . "\t" . $nResults . "\n" .
'G_Position' . "\t" . 'Coverage' . "\t" . implode("\t", $aTranscripts) . "\n");
}
if ($aGene['report_separately'] < count($aGene['positions'])) {
// Gene has at least one position that needs to be reported in the normal results file.
fputs($aFilesOut['results']['handler'], $sGene . "\t" . 'Positions found:' . "\t" . $nOriCandidates . "\t" . 'Positions analyzed:' . "\t" . $nCandidatesAnalyzed . "\t" . 'TIS found:' . "\t" . $nResults . "\n" .
'G_Position' . "\t" . 'Coverage' . "\t" . implode("\t", $aTranscripts) . "\n");
}
if ($sArgStrand == '+') {
ksort($aGene['positions']);
} else {
krsort($aGene['positions']);
}
foreach ($aGene['positions'] as $nPosition => $aPosition) {
$sFileHandler = (isset($aPosition['report_separately'])? 'results_after_cutoff' : 'results');
fputs($aFilesOut[$sFileHandler]['handler'], $aPositionsPerGene[$sGene]['chr'] . ':' . $nPosition . "\t" . $aCoverages[$aPositionsPerGene[$sGene]['chr']][$nPosition]);
foreach ($aTranscripts as $sTranscript) {
if (isset($aPosition[$sTranscript])) {
fputs($aFilesOut[$sFileHandler]['handler'], "\t" . $aPosition[$sTranscript]);
} else {
fputs($aFilesOut[$sFileHandler]['handler'], "\t-");
}
}
fputs($aFilesOut[$sFileHandler]['handler'], "\n");
}
if ($aGene['report_separately']) {
fputs($aFilesOut['results_after_cutoff']['handler'], "\n");
}
if ($aGene['report_separately'] < count($aGene['positions'])) {
fputs($aFilesOut['results']['handler'], "\n");
}
}
print("\n");
exit(0);
?>