-
Notifications
You must be signed in to change notification settings - Fork 7
/
PDB_headers.pl
executable file
·199 lines (174 loc) · 4.73 KB
/
PDB_headers.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/perl
## Pombert Lab 2020
my $version = '0.4b';
my $name = 'PDB_headers.pl';
my $updated = '2022-02-23';
use strict;
use warnings;
use Getopt::Long qw(GetOptions);
use File::Basename;
use File::Find;
use PerlIO::gzip;
## Usage definition
my $USAGE = <<"OPTIONS";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS Generates a Tab-delimited list of PDB structures, their titles and
chains from the PDB files headers
REQUIREMENTS PDB files downloaded from RCSB PDB; e.g. pdb2zvl.ent.gz
PerlIO::gzip
USAGE EXAMPLE ${name} \\
-p RCSB_PDB/ RCSB_PDB_obsolete/\\
-o RCSB_PDB_titles.tsv \\
-v 1000
OPTIONS:
-p (--pdb) Directories containing PDB files downloaded from RCSB PDB/PDBe (gzipped)
-o (--output) Output file in tsv format
-f (--force) Regenerate all PDB titles ## Default off
-v (--verbose) Prints progress every X file [Default: 1000]
OPTIONS
die "\n$USAGE\n" unless @ARGV;
## Defining options
my @pdbs;
my $rcsb_list;
my $force;
my $verbose = 1000;
GetOptions(
'p|pdb=s@{1,}' => \@pdbs,
'o|output=s' => \$rcsb_list,
'f|force' => \$force,
'v|verbose=i' => \$verbose
);
## Recursing through PDB directory
my @pdb;
for my $dir (@pdbs){
find(
sub { push @pdb, $File::Find::name unless -d; },
$dir
);
}
## Doing a first pass to see which files have been parsed previously
## Should reduce overall computation time by skipping parsing
my %previous_data;
my $diamond = '>';
unless ($force){
if (-f $rcsb_list){
$diamond = '>>';
open LIST, "<", "$rcsb_list" or die "Can't read $rcsb_list: $!\n";
while (my $line = <LIST>){
chomp $line;
if ($line =~ /^(\w+)/){
my $rcsb_entry = $1;
$previous_data{$rcsb_entry} = 1;
}
}
close LIST;
}
}
## Parsing PDB files (*.ent.gz)
open OUT, "$diamond", "$rcsb_list" or die "Can't create file $rcsb_list: $!\n";
my $pdb_count = 0;
my $start = time;
while (my $pb = shift@pdb){
if ($pb =~ /.ent.gz$/){ ## skipping other files if present
$pdb_count++;
## Grabbing RCSB PDB entry name from pdb file
my ($pdb, $folder) = fileparse($pb);
$pdb =~ s/^pdb//;
$pdb =~ s/.ent.gz$//;
## verbosity; lots of files to parse...
my $modulo = ($pdb_count % $verbose);
my $current_count = commify($pdb_count);
if ($modulo == 0){ print "Working on PDB file # $current_count: $pb\n"; }
## Working on PDB if is has not been seen before
if (exists $previous_data{$pdb}) { next; }
else {
open PDB, "<:gzip", "$pb" or die "Can't open file $pb: $!\n";
my $title = undef;
my %molecules;
my $mol_id = undef;
while (my $line = <PDB>){
chomp $line;
## Getting title info from TITLE entries
if ($line =~ /^TITLE\s{5}(.*)$/){
my $key = $1;
$key =~ s/\s+$/ /; ## Discard trailing space characters
$title .= $key;
}
elsif ($line =~ /^TITLE\s+\d+\s(.*)$/){
my $key = $1;
$key =~ s/\s+$/ /; ## Discard trailing space characters
$title .= $key;
}
## Getting chain information from COMPND entries
elsif ($line =~ /^COMPND\s+(\d+)?\s?MOL_ID:\s(\d+)/){
$mol_id = $2;
}
elsif ($line =~ /^COMPND\s+(\d+)?(.*)$/){
my $data = $2;
$data =~ s/\s+$//;
if ($mol_id){
$molecules{$mol_id} .= $data;
}
else {
print STDERR "[W] $pdb is missing a molecule ID\n";
}
}
}
binmode PDB, ":gzip(none)";
## Printing title
if (defined $title){
print OUT "$pdb\tTITLE\t$title\n";
}
else{
print STDERR "[W] $pdb is missing a title\n";
}
## Printing chain(s)
foreach my $id (sort (keys %molecules)){
my $molecule;
my $chains;
if ($molecules{$id} =~ /MOLECULE: (.*?);/){
$molecule = $1;
}
elsif($molecules{$id} =~ /MOLECULE: (.*?)/){
$molecule = $1;
## If at end of COMPND section, no semicolon to after the molecule(s)
}
if ($molecules{$id} =~ /CHAIN: (.*?);/){
$chains = $1;
}
elsif ($molecules{$id} =~ /CHAIN: (.*\w)/){
$chains = $1;
## If at end of COMPND section, no semicolon to after the chain(s)
}
my @chains;
if(defined $chains){
$chains =~ s/ //g;
@chains = split (",", $chains);
foreach my $chain (@chains){
if (defined $molecule){ print OUT "$pdb\t$chain\t$molecule\n"; }
## Molecules might not be defined if engineered
else {
print OUT "$pdb\t$chain\tundefined molecule\n";
}
}
}
else{
print STDERR "[W] $pdb is missing a chain\n";
}
}
}
}
}
my $final_count = commify($pdb_count);
my $run_time = (time - $start)/60;
$run_time = sprintf ("%.2f", $run_time);
print "\nIterated through a total of $final_count PDB files\n";
print "Job completed in $run_time minutes.\n\n";
### Subroutine(s)
sub commify {
my $text = reverse $_[0];
$text =~ s/(\d\d\d)(?=\d)(?!\d*\.)/$1,/g;
return scalar reverse $text;
}