Skip to content

Commit

Permalink
[issue/273] dbSNP 2 VCF reformat util (#300)
Browse files Browse the repository at this point in the history
* Takes N DbSNP 2 VCF files
(https://www.ncbi.nlm.nih.gov/snp/docs/products/vcf/redesign/), extracts
every population from the Freq=(.*) field in a separate INFO field,
drops the Freq field, drops the first allele for each population (which
is reference). Then it writes the population-specific fields to the info
header, and updates the yml config file to point to the newly formatted
vcf (the original vcf will not be overwritten).

This is necessary because the dbSNP 2 vcf does not make good use of the
VCF spec; the Freq field is the combination of multiple fields, each of
which is an Allelic type, but where the first allele is the reference,
which is not the standard use.

This will enable us to reproducibly fetch, transform, build dbSNP files,
from 1 yaml config, once we add 1 more utility, which translates the
RefSeq NC_* chromosome identifiers to chr1-22,X,Y,M.


Will remove [wip] once test added

---------

Co-authored-by: wingolab <thomas.wingo@emory.edu>
  • Loading branch information
akotlar and wingolab authored Oct 23, 2023
1 parent 041f051 commit 37b4cfe
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 48 deletions.
1 change: 1 addition & 0 deletions perl/bin/bystro-utils.pl
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
use Utils::RenameTrack;
use Utils::FilterCadd;
use Utils::RefGeneXdbnsfp;
use Utils::DbSnp2FormatInfo;

use Seq::Build;

Expand Down
55 changes: 10 additions & 45 deletions perl/lib/Seq/Role/IO.pm
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ sub getReadArgs {
return $outerCommand;
}
#@param {Path::Tiny} $file : the Path::Tiny object representing a single input file
#@param {Str} $innerFile : if passed a tarball, we will want to stream a single file within
#@param {Str} $errCode : what log level to use if we can't open the file
#@return file handle
sub getReadFh {
my ( $self, $file, $innerFile, $errCode ) = @_;
my ( $self, $file, $errCode ) = @_;

# By default, we'll return an error, rather than die-ing with log
# We wont be able to catch pipe errors however
Expand All @@ -122,53 +122,18 @@ sub getReadFh {

my $outerCommand = $self->getReadArgs($filePath);

if ( !$innerFile ) {
my ( $err, $fh );
my ( $err, $fh );

if ($outerCommand) {
$err = $self->safeOpen( $fh, '-|', "$outerCommand", $errCode );
}
else {
$err = $self->safeOpen( $fh, '<', $filePath, $errCode );
}

my $compressed = !!$outerCommand;

return ( $err, $compressed, $fh );
if ($outerCommand) {
$err = $self->safeOpen( $fh, '-|', "$outerCommand", $errCode );
}
else {
$err = $self->safeOpen( $fh, '<', $filePath, $errCode );
}

# TODO: Check that file exists again
# my $filePath;
# if($remoteCpCmd) {
# $filePath =
# if(ref $file ne 'Path::Tiny' ) {
# $file = path($file)->absolute;

# my $filePath = $file->stringify;
# }

# if (!$file->is_file) {
# my $err = "$filePath does not exist for reading";
# $self->log($errCode, $err);

# return ($err, undef, undef);
# }

# TODO: FIX
# if($innerFile) {
# my ($err, $compressed, $command) = $self->getInnerFileCommand($filePath, $innerFile, $errCode);

# if($err) {
# return ($err, undef, undef);
# }

# $err = $self->safeOpen(my $fh, '-|', $command, $errCode);

# # If an innerFile is passed, we assume that $file is a path to a tarball

# return ($err, $compressed, $fh);
# }
my $compressed = !!$outerCommand;

return ( $err, $compressed, $fh );
}

sub getRemoteProg {
Expand Down
211 changes: 211 additions & 0 deletions perl/lib/Utils/DbSnp2FormatInfo.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
#!/usr/bin/perl
use 5.10.0;
use strict;
use warnings;

# Take a DbSNP 2 VCF file, and for each row, split the INFO field's FREQ data into separate INFO fields for each population
# NOTE: dbSNP VCF spec: https://www.ncbi.nlm.nih.gov/snp/docs/products/vcf/redesign/
# NOTE: that dbSNP uses a '.' to represent a missing value and first allele is the reference, which is not the standard use.

package Utils::DbSnp2FormatInfo;

our $VERSION = '0.001';

use File::Basename qw/basename/;

use Mouse 2;
use namespace::autoclean;
use Path::Tiny qw/path/;

use Seq::Tracks::Build::LocalFilesPaths;

# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path
extends 'Utils::Base';

my $INFO_INDEX = 7;

sub BUILD {
my $self = shift;

my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new();

my $localFilesAref = $localFilesHandler->makeAbsolutePaths(
$self->_decodedConfig->{files_dir},
$self->_wantedTrack->{name},
$self->_wantedTrack->{local_files}
);

$self->{_localFiles} = $localFilesAref;
}

sub _get_fh_paths {
my ( $self, $input_vcf ) = @_;

if ( !-e $input_vcf ) {
$self->log( 'fatal', "input file path $input_vcf doesn't exist" );
return;
}

my ( $err, $isCompressed, $in_fh ) = $self->getReadFh($input_vcf);

$isCompressed ||= $self->compress;

if ($err) {
$self->log( 'fatal', $err );
return;
}

my $base_name = basename($input_vcf);
$base_name =~ s/\.[^.]+$//; # Remove last file extension (if present)
$base_name
=~ s/\.[^.]+$//; # Remove another file extension if it's something like .vcf.gz

my $output_vcf_data = $base_name . "_vcf_data.vcf" . ( $isCompressed ? ".gz" : "" );
my $output_vcf_header =
$base_name . "_vcf_header.vcf" . ( $isCompressed ? ".gz" : "" );
my $output_vcf = $base_name . "_processed.vcf" . ( $isCompressed ? ".gz" : "" );

$self->log( 'info', "Reading $input_vcf" );

my $output_header_path =
path( $self->_localFilesDir )->child($output_vcf_header)->stringify();
my $output_data_path =
path( $self->_localFilesDir )->child($output_vcf_data)->stringify();
my $output_path = path( $self->_localFilesDir )->child($output_vcf)->stringify();

if ( ( -e $output_data_path || -e $output_header_path || -e $output_path )
&& !$self->overwrite )
{
$self->log( 'fatal',
"Temp files $output_data_path, $output_header_path, or final output path $output_path exist, and overwrite is not set"
);
return;
}

return ( $in_fh, $output_data_path, $output_header_path, $output_path );
}

sub go {
my $self = shift;

my @output_paths;

for my $input_vcf ( @{ $self->{_localFiles} } ) {
my ( $in_fh, $output_data_path, $output_header_path, $output_path ) =
$self->_get_fh_paths($input_vcf);

my $output_data_fh = $self->getWriteFh($output_data_path);

$self->log( 'info', "Writing to $output_data_path" );

my %populations;
my @ordered_populations;

my @header_lines;
while (<$in_fh>) {
chomp;

# If it's a header line
if (/^#/) {
push @header_lines, $_;
next;
}

my @fields = split( "\t", $_ );

if ( !@fields ) {
$self->log( "fatal", "No fields found in row: $_" );
return;
}

my @info_fields = split( ";", $fields[$INFO_INDEX] );

my @ordered_info_freqs;
my %seen_info_pops;

my $seen_freq = 0;
foreach my $info (@info_fields) {
if ( $info =~ /FREQ=(.+)/ ) {
if ( $seen_freq == 1 ) {
$self->log( "fatal", "FREQ seen twice in INFO field. Row: $_" );
return;
}

$seen_freq = 1;

my $freq_data = $1;
my @pops = split( /\|/, $freq_data );

foreach my $pop (@pops) {
if ( $pop =~ /([^:]+):(.+)/ ) {
my $pop_name = $1;

if ( exists $seen_info_pops{$pop_name} ) {
self->log( "fatal", "Population $pop_name seen twice in INFO field. Row: $_" );
return;
}

my @freq_vals = split( /,/, $2 );
shift @freq_vals; # Remove the reference allele freq

push @ordered_info_freqs, [ $pop_name, join( ",", @freq_vals ) ];

if ( !exists $populations{$pop_name} ) {
push @ordered_populations, $pop_name;
$populations{$pop_name} = 1;
}
}
}

# Append the new frequency data to the INFO field
my @new_info_fields;
for my $res (@ordered_info_freqs) {
my $name = $res->[0];
my $freq = $res->[1];
push @new_info_fields, "$name=$freq";
}

$info = join( ";", @new_info_fields );
}
}

$fields[$INFO_INDEX] = join( ";", @info_fields );

say $output_data_fh join( "\t", @fields );
}

close($in_fh);
close($output_data_fh);

# Update the VCF header with new populations
my @pop_lines;
foreach my $pop (@ordered_populations) {
push @pop_lines,
"##INFO=<ID=$pop,Number=A,Type=Float,Description=\"Frequency for $pop\">";
}

splice( @header_lines, -1, 0, @pop_lines );

my $header_fh = $self->getWriteFh($output_header_path);

# Write the updated header and VCF to output
say $header_fh join( "\n", @header_lines );
close($header_fh);

system("cat $output_header_path $output_data_path > $output_path") == 0
or die "Failed to concatenate files: $?";
system("rm $output_header_path $output_data_path") == 0
or die "Failed to remove temporary files: $?";

$self->log( 'info', "$input_vcf processing complete" );

push @output_paths, $output_path;
}

$self->_wantedTrack->{local_files} = \@output_paths;

$self->_backupAndWriteConfig();
}

__PACKAGE__->meta->make_immutable;
1;
2 changes: 1 addition & 1 deletion perl/lib/Utils/LiftOverCadd.pm
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ sub go {
for my $inPath (@$localFilesPathsAref) {
$self->log( 'info', "Beginning to lift over $inPath" );

my ( undef, $isCompressed, $inFh ) = $self->getReadFh( $inPath, undef, 'fatal' );
my ( undef, $isCompressed, $inFh ) = $self->getReadFh( $inPath, 'fatal' );

my $baseName = path($inPath)->basename;

Expand Down
4 changes: 2 additions & 2 deletions perl/lib/Utils/SortCadd.pm
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ sub go {
$self->log( 'info', "Out path base: $outPathBase" );
$self->log( 'info', "Reading input file: $inFilePath" );

my ( undef, $compressed, $readFh ) = $self->getReadFh( $inFilePath, undef, 'fatal' );
my ( undef, $compressed, $readFh ) = $self->getReadFh( $inFilePath, 'fatal' );

my $versionLine = <$readFh>;
my $headerLine = <$readFh>;
Expand Down Expand Up @@ -155,7 +155,7 @@ sub go {
for my $outPath (@outPaths) {
my $gzipPath = $self->gzip;

my ( undef, $compressed, $fh ) = $self->getReadFh( $outPath, undef, 'fatal' );
my ( undef, $compressed, $fh ) = $self->getReadFh( $outPath, 'fatal' );

my $outExt = '.sorted' . $outExtPart;

Expand Down
Loading

0 comments on commit 37b4cfe

Please sign in to comment.