-
Notifications
You must be signed in to change notification settings - Fork 14
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[issue/273] dbSNP 2 VCF reformat util #300
Changes from 10 commits
e381a56
221ec00
a104a5e
a9b0614
e81355d
dbf6d92
70d7abf
63f8a69
3bddd45
6009aee
fbb00bd
0038814
ae3962f
a977d43
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,171 @@ | ||
#!/usr/bin/perl | ||
use 5.10.0; | ||
use strict; | ||
use warnings; | ||
use DDP; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you still need DDP after development is done? |
||
|
||
# Take a DbSNP 2 VCF file, and for each row, split the INFO field's FREQ data into separate INFO fields for each population | ||
package Utils::DbSnp2FormatInfo; | ||
|
||
our $VERSION = '0.001'; | ||
|
||
use File::Basename qw/basename/; | ||
|
||
use Mouse 2; | ||
use namespace::autoclean; | ||
use Path::Tiny qw/path/; | ||
|
||
use Seq::Tracks::Build::LocalFilesPaths; | ||
|
||
# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path | ||
extends 'Utils::Base'; | ||
|
||
sub BUILD { | ||
my $self = shift; | ||
|
||
my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new(); | ||
|
||
my $localFilesAref = $localFilesHandler->makeAbsolutePaths( | ||
$self->_decodedConfig->{files_dir}, | ||
$self->_wantedTrack->{name}, | ||
$self->_wantedTrack->{local_files} | ||
); | ||
|
||
$self->{_localFiles} = $localFilesAref; | ||
} | ||
|
||
# TODO: error check opening of file handles, write tests | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is this comment still in force? (looks like we have tests now for this module) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. no it isn't, thanks |
||
sub go { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function seems like it's doing a lot of different things at a few different levels of abstraction. One way to address that might be to break it up into helpers roughly along the lines of:
But I defer to your judgment as to whether the juice is worth the squeeze there? |
||
my $self = shift; | ||
|
||
my @output_paths; | ||
|
||
for my $input_vcf ( @{ $self->{_localFiles} } ) { | ||
if ( !-e $input_vcf ) { | ||
$self->log( 'fatal', "input file path $input_vcf doesn't exist" ); | ||
return; | ||
} | ||
|
||
my ( $err, $isCompressed, $in_fh ) = $self->getReadFh($input_vcf); | ||
|
||
$isCompressed ||= $self->compress; | ||
|
||
if ($err) { | ||
$self->log( 'fatal', $err ); | ||
return; | ||
} | ||
|
||
my $base_name = basename($input_vcf); | ||
$base_name =~ s/\.[^.]+$//; # Remove last file extension (if present) | ||
$base_name | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. how many different kinds of file extensions are we likely to have to handle here, and could we match explicitly on those instead of sedding off two extensions? The worry is someone uploading a file named something like foo.bar.vcf, which would get pared down to foo [?] |
||
=~ s/\.[^.]+$//; # Remove another file extension if it's something like .vcf.gz | ||
|
||
my $output_vcf_data = $base_name . "_vcf_data.vcf" . ( $isCompressed ? ".gz" : "" ); | ||
my $output_vcf_header = | ||
$base_name . "_vcf_header.vcf" . ( $isCompressed ? ".gz" : "" ); | ||
my $output_vcf = $base_name . "_processed.vcf" . ( $isCompressed ? ".gz" : "" ); | ||
|
||
$self->log( 'info', "Reading $input_vcf" ); | ||
|
||
my $output_header_path = | ||
path( $self->_localFilesDir )->child($output_vcf_header)->stringify(); | ||
my $output_data_path = | ||
path( $self->_localFilesDir )->child($output_vcf_data)->stringify(); | ||
my $output_path = path( $self->_localFilesDir )->child($output_vcf)->stringify(); | ||
|
||
if ( ( -e $output_data_path || -e $output_header_path || -e $output_path ) | ||
&& !$self->overwrite ) | ||
{ | ||
$self->log( 'fatal', | ||
"Temp files $output_data_path, $output_header_path, or final output path $output_path exist, and overwrite is not set" | ||
); | ||
return; | ||
} | ||
|
||
my $output_data_fh = $self->getWriteFh($output_data_path); | ||
|
||
$self->log( 'info', "Writing to $output_data_path" ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Everything to this point is getting setup to do actual work. This seems like a logical place to split apart the function into two sections, which will also aid testing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
|
||
# Store populations seen across the VCF | ||
my %populations; | ||
|
||
my @header_lines; | ||
while (<$in_fh>) { | ||
chomp; | ||
|
||
# If it's a header line | ||
if (/^#/) { | ||
push @header_lines, $_; | ||
next; | ||
} | ||
|
||
# If it's an INFO line | ||
if (/FREQ=/) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm probably missing context here but this comment / condition pair is a little surprising because we're not matching against There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I found this curious too. This is actually a clever solution, unless I'm missing something. ChatGPT came up with this, and I ended up keeping it because it was cute and quite elegant. It checks rows for the FREQ=. If it doesn't find that, there's nothing to do but add the row as is. FREQ isn't guaranteed to be in the INFO field. It is also not found any other field, and never will be. Then we split on ";". That will result in 1 field that contains everything up until the first INFO value, and the rest of the info values. We find the field containing FREQ=, extract the FREQ=(.*) value, expand the individual population POP=VAL as new INFO fields, join by ";" to the first field, which results in a complete file with correct delimiters. I would have written this by splitting each field by "\t", then search INFO, but didn't see any downsides with this approach. FREQ is guaranteed to never appear 2x, though it could potentially appear 0 times, and this handles that case :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. INFO is a positional field so what Alex is searching for is some value within that field. I see the logic. I would use index to find "FREQ=" rather than a regular expression but that's just my preference. My reading of the regexps is that they would catch some non-numeric characters and your tests suggest it catches trailing stuff. I'm not sure how important that is for you. |
||
my @info_fields = split( /;/, $_ ); | ||
my @new_info_fields; | ||
my %freqs; | ||
|
||
foreach my $info (@info_fields) { | ||
if ( $info =~ /FREQ=(.+)/ ) { | ||
my $freq_data = $1; | ||
my @pops = split( /\|/, $freq_data ); | ||
|
||
foreach my $pop (@pops) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like the heart of what you're doing - how about making it a func that you'd test? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I sent you my suggestion for sharding the functions and testing via teams. |
||
if ( $pop =~ /([^:]+):(.+)/ ) { | ||
my $pop_name = $1; | ||
my @freq_vals = split( /,/, $2 ); | ||
shift @freq_vals; # Remove the reference allele freq | ||
$freqs{$pop_name} = join( ",", @freq_vals ); | ||
$populations{$pop_name} = 1; | ||
} | ||
} | ||
} | ||
else { | ||
push @new_info_fields, $info; # Keep the existing INFO fields | ||
} | ||
} | ||
|
||
# Append the new frequency data to the INFO field | ||
foreach my $pop_name ( keys %freqs ) { | ||
push @new_info_fields, "$pop_name=$freqs{$pop_name}"; | ||
} | ||
|
||
say $output_data_fh join( ";", @new_info_fields ); | ||
} | ||
} | ||
|
||
close($in_fh); | ||
close($output_data_fh); | ||
|
||
# Update the VCF header with new populations | ||
my @pop_lines; | ||
foreach my $pop ( keys %populations ) { | ||
push @pop_lines, | ||
"##INFO=<ID=$pop,Number=A,Type=Float,Description=\"Frequency for $pop\">"; | ||
} | ||
|
||
splice( @header_lines, -1, 0, @pop_lines ); | ||
|
||
my $header_fh = $self->getWriteFh($output_header_path); | ||
|
||
# Write the updated header and VCF to output | ||
say $header_fh join( "\n", @header_lines ); | ||
close($header_fh); | ||
|
||
system("cat $output_header_path $output_data_path > $output_path") == 0 | ||
or die "Failed to concatenate files: $?"; | ||
system("rm $output_header_path $output_data_path") == 0 | ||
or die "Failed to remove temporary files: $?"; | ||
|
||
$self->log( 'info', "$input_vcf processing complete" ); | ||
|
||
push @output_paths, $output_path; | ||
} | ||
|
||
$self->_wantedTrack->{local_files} = \@output_paths; | ||
|
||
$self->_backupAndWriteConfig(); | ||
} | ||
|
||
__PACKAGE__->meta->make_immutable; | ||
1; |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
#!/usr/bin/perl | ||
use strict; | ||
use warnings; | ||
use Test::More; | ||
|
||
use Path::Tiny; | ||
use YAML::XS qw/DumpFile/; | ||
|
||
use Utils::DbSnp2FormatInfo; | ||
|
||
# create temp directories | ||
my $db_dir = Path::Tiny->tempdir(); | ||
my $raw_dir = Path::Tiny->tempdir(); | ||
|
||
my $vcf_path = $raw_dir->child('test.vcf')->stringify; | ||
my $expected_output_vcf_path = | ||
$raw_dir->child('dbSNP/test_processed.vcf')->stringify; | ||
|
||
my $config = { | ||
'assembly' => 'hg38', | ||
'chromosomes' => ['chr1'], | ||
'database_dir' => $db_dir->stringify, | ||
'files_dir' => $raw_dir->stringify, | ||
'tracks' => { | ||
'tracks' => [ | ||
{ | ||
'local_files' => [$vcf_path], | ||
'name' => 'dbSNP', | ||
'sorted' => 1, | ||
'type' => 'vcf', | ||
'utils' => [ { 'name' => 'DbSnp2FormatInfo' } ] | ||
} | ||
] | ||
} | ||
}; | ||
|
||
# write temporary config file | ||
my $config_file = $raw_dir->child('filterCadd.yml'); | ||
DumpFile( $config_file, $config ); | ||
|
||
# Prepare a sample VCF for testing | ||
my $vcf_data = <<'END_VCF'; | ||
##fileformat=VCFv4.1 | ||
##INFO=<ID=RS,Number=1,Type=String,Description="dbSNP ID"> | ||
##INFO=<ID=dbSNPBuildID,Number=1,Type=Integer,Description="dbSNP Build ID"> | ||
##INFO=<ID=SSR,Number=0,Type=Flag,Description="Variant is a short tandem repeat"> | ||
#CHROM POS ID REF ALT QUAL FILTER INFO | ||
NC_000001.11 10001 rs1570391677 T A,C . . RS=1570391677;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=KOREAN:0.9891,0.0109,.|SGDP_PRJ:0,1,.|dbGaP_PopFreq:1,.,0 | ||
NC_000001.11 10002 rs1570391692 A C . . RS=1570391692;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=KOREAN:0.9944,0.005597 | ||
END_VCF | ||
|
||
# Write sample VCF to a temporary file | ||
open my $fh, '>', $vcf_path or die "Could not open $vcf_path: $!"; | ||
print $fh $vcf_data; | ||
close $fh; | ||
|
||
# Initialize the utility and process the VCF | ||
my $utility = Utils::DbSnp2FormatInfo->new( | ||
{ | ||
config => $config_file, | ||
name => 'dbSNP', | ||
utilName => 'DbSnp2FormatInfo' | ||
} | ||
); | ||
|
||
$utility->go($vcf_path); | ||
|
||
# Check that the processed file exists and is correctly formatted | ||
ok( -e $expected_output_vcf_path, "Processed VCF file exists" ); | ||
|
||
# Read the processed file to check the INFO field | ||
$fh = path($expected_output_vcf_path)->openr; | ||
|
||
ok( <$fh> == "##fileformat=VCFv4.1", 'VCF fileformat is correctly processed' ); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're comparing a string using e.g., $fh = path($expected_output_vcf_path)->openr;
my $str = <$fh>;
chomp $str;
ok( $str eq "##fileformat=VCFv4.1", 'VCF fileformat is correctly processed' );
$str = <$fh>;
chomp $str;
ok( $str eq "##INFO=<ID=RS,Number=1,Type=String,Description=\"dbSNP ID\">",
'RS population is correctly processed' ); |
||
ok( <$fh> == "##INFO=<ID=RS,Number=1,Type=String,Description=\"dbSNP ID\">", | ||
'RS population is correctly processed' ); | ||
ok( | ||
<$fh> | ||
== "##INFO=<ID=dbSNPBuildID,Number=1,Type=Integer,Description=\"dbSNP Build ID\">", | ||
'dbSNPBuildID population is correctly processed' | ||
); | ||
ok( | ||
<$fh> | ||
== "##INFO=<ID=SSR,Number=0,Type=Flag,Description=\"Variant is a short tandem repeat\">", | ||
'SSR population is correctly processed' | ||
); | ||
ok( | ||
<$fh> | ||
== "##INFO=<ID=KOREAN,Number=A,Type=Float,Description=\"Frequency for KOREAN\">", | ||
'KOREAN population is correctly processed' | ||
); | ||
ok( | ||
<$fh> | ||
== "##INFO=<ID=SGDP_PRJ,Number=A,Type=Float,Description=\"Frequency for SGDP_PRJ\">", | ||
'SGDP_PRJ population is correctly processed' | ||
); | ||
ok( | ||
<$fh> | ||
== "##INFO=<ID=dbGaP_PopFreq,Number=A,Type=Float,Description=\"Frequency for dbGaP_PopFreq\">", | ||
'dbGaP_PopFreq population is correctly processed' | ||
); | ||
ok( <$fh> == "#CHROM POS ID REF ALT QUAL FILTER INFO" ); | ||
|
||
ok( | ||
<$fh> | ||
== "NC_000001.11 10001 rs1570391677 T A,C . . RS=1570391677;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;KOREAN=0.0109,.;SGDP_PRJ=0,.;dbGaP_PopFreq=.,0", | ||
'1st data row wiht KOREAN, SGDP_PRJ, dbGap freqs are correctly processed' | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: wiht -> with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. bah :) need CI spellcheck, clearly thanks |
||
); | ||
ok( | ||
<$fh> | ||
== "NC_000001.11 10002 rs1570391692 A C . . RS=1570391692;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;KOREAN=0.005597", | ||
'2nd data row with KOREAN freq is correctly processed' | ||
); | ||
|
||
done_testing(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think about adding or switching to checking magic strings to deduce the compression type rather than matching extensions exclusively? File::LibMagic is a Perl package that binds to the c library that seems like it would be a good fit and probably much easier to use than our own implementation for checking magic strings.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sounds reasonable, though beyond the scope of this PR, as the file extension solution is used everywhere (the change here was to remove accepting $innerFile, which was not handled if provided). I have a tracking ticket #312 to evaluate the switch to File::LibMagic, scheduled for Sprint 4.