forked from chiulab/surpi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
split_fasta.pl
executable file
·98 lines (76 loc) · 2.52 KB
/
split_fasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/perl
#split_fasta.pl version 1.0
#This script accepts a file consisting of multiple FASTA formatted sequence records.
#It splits the file into multiple new files, each consisting of a subset of the original records.
#
#There are three command line options:
#
#-i input file.
#-o output file prefix. This script will append numbers to this prefix name so that each created file is unique.
#-n the number of sequences to place in each output file.
#
#Example usage:
#
#perl split_fasta.pl -i sample_in.txt -o new_sequences -n 100
#
#Written by Paul Stothard, Canadian Bioinformatics Help Desk.
#
#stothard@ualberta.ca
use strict;
use warnings;
#Command line processing.
#
#All three options are required. The script stops with the usage message when
#any of them is missing or when the command line cannot be parsed, and with a
#separate message when -n is not a positive number.
use Getopt::Long;

my $inputFile;     #-i / --input_file: the multi-record FASTA file to read
my $outputFile;    #-o / --output_file_prefix: prefix for the output file names
my $numberToCopy;  #-n / --number: how many sequences go into each output file

my $usage = "Usage: split_fasta.pl -i <input file> -o <output file> -n <number of sequences to write per file>\n";

Getopt::Long::Configure ('bundling');

#GetOptions returns false when it meets an unknown option or a value of the
#wrong type (for example a non-integer -n). Stop in that case instead of
#carrying on with a partly parsed command line.
GetOptions ('i|input_file=s' => \$inputFile,
            'o|output_file_prefix=s' => \$outputFile,
            'n|number=i' => \$numberToCopy) or die ($usage);

foreach my $requiredValue ($inputFile, $outputFile, $numberToCopy) {
    if (!defined($requiredValue)) {
        die ($usage);
    }
}

if ($numberToCopy <= 0) {
    die ("-n value must be greater than 0.\n");
}
#Split the input into numbered output files named <prefix>_0, <prefix>_1, ...
#
#Each record is rewritten as a ">title" line followed by the sequence on a
#single line, with every character that is not a letter removed from the
#sequence.
#
#<prefix>_0 is always created, even when the input holds no records. Each
#later file is opened only once there is a record to put in it, so an input
#whose record count is an exact multiple of -n does not leave an empty file
#at the end.
my $fileCount = 0;     #index of the output file currently being written
my $seqThisFile = 0;   #number of records already written to that file
my $isFirstEntry = 1;  #true until the first chunk of input has been handled

my $outputName = $outputFile . "_" . $fileCount;
open (my $outFh, ">", $outputName) or die ("Cannot open file for output: $outputName: $!");
open (my $seqFh, "<", $inputFile) or die ("Cannot open file : $inputFile: $!");

{
    #A record begins with ">" at the start of a line, so read up to the next
    #newline followed by ">". A ">" that appears inside a title line then
    #stays part of that title instead of ending the record. The change to $/
    #is limited to this block.
    local $/ = "\n>";

    while (my $sequenceEntry = <$seqFh>) {
        #remove the trailing record separator, if present
        chomp ($sequenceEntry);

        #only the first chunk still carries the ">" that opens its record
        if ($isFirstEntry) {
            $sequenceEntry =~ s/^\s*>//;
            $isFirstEntry = 0;
        }

        #nothing but whitespace: there is no record here
        if ($sequenceEntry !~ m/\S/) {
            next;
        }

        #the title is the first line of the record
        my $sequenceTitle = "No title was found!";
        if ($sequenceEntry =~ m/^([^\n]+)/) {
            $sequenceTitle = $1;
        }

        #drop the title line, then everything that is not a letter
        $sequenceEntry =~ s/^[^\n]+//;
        $sequenceEntry =~ s/[^A-Za-z]//g;

        #the current file is full: move on to the next one before writing
        if ($seqThisFile == $numberToCopy) {
            close ($outFh) or die ("Cannot close file : $outputName: $!");
            $fileCount++;
            $seqThisFile = 0;
            $outputName = $outputFile . "_" . $fileCount;
            open ($outFh, ">", $outputName) or die ("Cannot open file for output: $outputName: $!");
        }

        #write record to file
        print {$outFh} ">$sequenceTitle\n", "$sequenceEntry\n"
            or die ("Cannot write to file : $outputName: $!");
        $seqThisFile++;
    }#end of while loop
}

close ($seqFh) or die ("Cannot close file : $inputFile: $!");
close ($outFh) or die ("Cannot close file : $outputName: $!");