bystrogenomics · akotlar · Oct 23, 2023 · Oct 17, 2023 · Oct 17, 2023 · Oct 18, 2023
diff --git a/perl/bin/bystro-utils.pl b/perl/bin/bystro-utils.pl
@@ -19,6 +19,7 @@
 use Utils::RenameTrack;
 use Utils::FilterCadd;
 use Utils::RefGeneXdbnsfp;
+use Utils::DbSnp2FormatInfo;
 
 use Seq::Build;
 

diff --git a/perl/lib/Seq/Role/IO.pm b/perl/lib/Seq/Role/IO.pm
@@ -101,10 +101,10 @@ sub getReadArgs {
   return $outerCommand;
 }
 #@param {Path::Tiny} $file : the Path::Tiny object representing a single input file
-#@param {Str} $innerFile : if passed a tarball, we will want to stream a single file within
+#@param {Str} $errCode : what log level to use if we can't open the file
 #@return file handle
 sub getReadFh {
-  my ( $self, $file, $innerFile, $errCode ) = @_;
+  my ( $self, $file, $errCode ) = @_;
 
   # By default, we'll return an error, rather than die-ing with log
   # We wont be able to catch pipe errors however
@@ -122,53 +122,18 @@ sub getReadFh {
 
   my $outerCommand = $self->getReadArgs($filePath);
 
-  if ( !$innerFile ) {
-    my ( $err, $fh );
+  my ( $err, $fh );
 
-    if ($outerCommand) {
-      $err = $self->safeOpen( $fh, '-|', "$outerCommand", $errCode );
-    }
-    else {
-      $err = $self->safeOpen( $fh, '<', $filePath, $errCode );
-    }
-
-    my $compressed = !!$outerCommand;
-
-    return ( $err, $compressed, $fh );
+  if ($outerCommand) {
+    $err = $self->safeOpen( $fh, '-|', "$outerCommand", $errCode );
+  }
+  else {
+    $err = $self->safeOpen( $fh, '<', $filePath, $errCode );
   }
 
-  # TODO: Check that file exists again
-  # my $filePath;
-  # if($remoteCpCmd) {
-  #   $filePath =
-  #   if(ref $file ne 'Path::Tiny' ) {
-  #   $file = path($file)->absolute;
-
-  #   my $filePath = $file->stringify;
-  # }
-
-  # if (!$file->is_file) {
-  #   my $err = "$filePath does not exist for reading";
-  #   $self->log($errCode, $err);
-
-  #   return ($err, undef, undef);
-  # }
-
-  # TODO: FIX
-  # if($innerFile) {
-  #   my ($err, $compressed, $command) = $self->getInnerFileCommand($filePath, $innerFile, $errCode);
-
-  #   if($err) {
-  #     return ($err, undef, undef);
-  #   }
-
-  #   $err = $self->safeOpen(my $fh, '-|', $command, $errCode);
-
-  #   # If an innerFile is passed, we assume that $file is a path to a tarball
-
-  #   return ($err, $compressed, $fh);
-  # }
+  my $compressed = !!$outerCommand;
 
+  return ( $err, $compressed, $fh );
 }
 
 sub getRemoteProg {

diff --git a/perl/lib/Utils/DbSnp2FormatInfo.pm b/perl/lib/Utils/DbSnp2FormatInfo.pm
@@ -0,0 +1,171 @@
+#!/usr/bin/perl
+use 5.10.0;
+use strict;
+use warnings;
+use DDP;
+
+# Take a DbSNP 2 VCF file, and for each row, split the INFO field's FREQ data into separate INFO fields for each population
+package Utils::DbSnp2FormatInfo;
+
+our $VERSION = '0.001';
+
+use File::Basename qw/basename/;
+
+use Mouse 2;
+use namespace::autoclean;
+use Path::Tiny qw/path/;
+
+use Seq::Tracks::Build::LocalFilesPaths;
+
+# Exports: _localFilesDir, _decodedConfig, compress, _wantedTrack, _setConfig, logPath, use_absolute_path
+extends 'Utils::Base';
+
+sub BUILD {
+  my $self = shift;
+
+  my $localFilesHandler = Seq::Tracks::Build::LocalFilesPaths->new();
+
+  my $localFilesAref = $localFilesHandler->makeAbsolutePaths(
+    $self->_decodedConfig->{files_dir},
+    $self->_wantedTrack->{name},
+    $self->_wantedTrack->{local_files}
+  );
+
+  $self->{_localFiles} = $localFilesAref;
+}
+
+# TODO: error check opening of file handles, write tests
+sub go {
+  my $self = shift;
+
+  my @output_paths;
+
+  for my $input_vcf ( @{ $self->{_localFiles} } ) {
+    if ( !-e $input_vcf ) {
+      $self->log( 'fatal', "input file path $input_vcf doesn't exist" );
+      return;
+    }
+
+    my ( $err, $isCompressed, $in_fh ) = $self->getReadFh($input_vcf);
+
+    $isCompressed ||= $self->compress;
+
+    if ($err) {
+      $self->log( 'fatal', $err );
+      return;
+    }
+
+    my $base_name = basename($input_vcf);
+    $base_name =~ s/\.[^.]+$//; # Remove last file extension (if present)
+    $base_name
+      =~ s/\.[^.]+$//; # Remove another file extension if it's something like .vcf.gz
+
+    my $output_vcf_data = $base_name . "_vcf_data.vcf" . ( $isCompressed ? ".gz" : "" );
+    my $output_vcf_header =
+      $base_name . "_vcf_header.vcf" . ( $isCompressed ? ".gz" : "" );
+    my $output_vcf = $base_name . "_processed.vcf" . ( $isCompressed ? ".gz" : "" );
+
+    $self->log( 'info', "Reading $input_vcf" );
+
+    my $output_header_path =
+      path( $self->_localFilesDir )->child($output_vcf_header)->stringify();
+    my $output_data_path =
+      path( $self->_localFilesDir )->child($output_vcf_data)->stringify();
+    my $output_path = path( $self->_localFilesDir )->child($output_vcf)->stringify();
+
+    if ( ( -e $output_data_path || -e $output_header_path || -e $output_path )
+      && !$self->overwrite )
+    {
+      $self->log( 'fatal',
+        "Temp files $output_data_path, $output_header_path, or final output path $output_path exist, and overwrite is not set"
+      );
+      return;
+    }
+
+    my $output_data_fh = $self->getWriteFh($output_data_path);
+
+    $self->log( 'info', "Writing to $output_data_path" );
+
+    # Store populations seen across the VCF
+    my %populations;
+
+    my @header_lines;
+    while (<$in_fh>) {
+      chomp;
+
+      # If it's a header line
+      if (/^#/) {
+        push @header_lines, $_;
+        next;
+      }
+
+      # If it's an INFO line
+      if (/FREQ=/) {
+        my @info_fields = split( /;/, $_ );
+        my @new_info_fields;
+        my %freqs;
+
+        foreach my $info (@info_fields) {
+          if ( $info =~ /FREQ=(.+)/ ) {
+            my $freq_data = $1;
+            my @pops      = split( /\|/, $freq_data );
+
+            foreach my $pop (@pops) {
+              if ( $pop =~ /([^:]+):(.+)/ ) {
+                my $pop_name  = $1;
+                my @freq_vals = split( /,/, $2 );
+                shift @freq_vals; # Remove the reference allele freq
+                $freqs{$pop_name}       = join( ",", @freq_vals );
+                $populations{$pop_name} = 1;
+              }
+            }
+          }
+          else {
+            push @new_info_fields, $info; # Keep the existing INFO fields
+          }
+        }
+
+        # Append the new frequency data to the INFO field
+        foreach my $pop_name ( keys %freqs ) {
+          push @new_info_fields, "$pop_name=$freqs{$pop_name}";
+        }
+
+        say $output_data_fh join( ";", @new_info_fields );
+      }
+    }
+
+    close($in_fh);
+    close($output_data_fh);
+
+    # Update the VCF header with new populations
+    my @pop_lines;
+    foreach my $pop ( keys %populations ) {
+      push @pop_lines,
+        "##INFO=<ID=$pop,Number=A,Type=Float,Description=\"Frequency for $pop\">";
+    }
+
+    splice( @header_lines, -1, 0, @pop_lines );
+
+    my $header_fh = $self->getWriteFh($output_header_path);
+
+    # Write the updated header and VCF to output
+    say $header_fh join( "\n", @header_lines );
+    close($header_fh);
+
+    system("cat $output_header_path $output_data_path > $output_path") == 0
+      or die "Failed to concatenate files: $?";
+    system("rm $output_header_path $output_data_path") == 0
+      or die "Failed to remove temporary files: $?";
+
+    $self->log( 'info', "$input_vcf processing complete" );
+
+    push @output_paths, $output_path;
+  }
+
+  $self->_wantedTrack->{local_files} = \@output_paths;
+
+  $self->_backupAndWriteConfig();
+}
+
+__PACKAGE__->meta->make_immutable;
+1;
diff --git a/perl/lib/Utils/LiftOverCadd.pm b/perl/lib/Utils/LiftOverCadd.pm
@@ -76,7 +76,7 @@ sub go {
   for my $inPath (@$localFilesPathsAref) {
     $self->log( 'info', "Beginning to lift over $inPath" );
 
-    my ( undef, $isCompressed, $inFh ) = $self->getReadFh( $inPath, undef, 'fatal' );
+    my ( undef, $isCompressed, $inFh ) = $self->getReadFh( $inPath, 'fatal' );
 
     my $baseName = path($inPath)->basename;
 

diff --git a/perl/lib/Utils/SortCadd.pm b/perl/lib/Utils/SortCadd.pm
@@ -64,7 +64,7 @@ sub go {
     $self->log( 'info', "Out path base: $outPathBase" );
     $self->log( 'info', "Reading input file: $inFilePath" );
 
-    my ( undef, $compressed, $readFh ) = $self->getReadFh( $inFilePath, undef, 'fatal' );
+    my ( undef, $compressed, $readFh ) = $self->getReadFh( $inFilePath, 'fatal' );
 
     my $versionLine = <$readFh>;
     my $headerLine  = <$readFh>;
@@ -155,7 +155,7 @@ sub go {
   for my $outPath (@outPaths) {
     my $gzipPath = $self->gzip;
 
-    my ( undef, $compressed, $fh ) = $self->getReadFh( $outPath, undef, 'fatal' );
+    my ( undef, $compressed, $fh ) = $self->getReadFh( $outPath, 'fatal' );
 
     my $outExt = '.sorted' . $outExtPart;
 

diff --git a/perl/t/utils/dbsnp2FormatInfo.t b/perl/t/utils/dbsnp2FormatInfo.t
@@ -0,0 +1,115 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+use Test::More;
+
+use Path::Tiny;
+use YAML::XS qw/DumpFile/;
+
+use Utils::DbSnp2FormatInfo;
+
+# create temp directories
+my $db_dir  = Path::Tiny->tempdir();
+my $raw_dir = Path::Tiny->tempdir();
+
+my $vcf_path = $raw_dir->child('test.vcf')->stringify;
+my $expected_output_vcf_path =
+  $raw_dir->child('dbSNP/test_processed.vcf')->stringify;
+
+my $config = {
+  'assembly'     => 'hg38',
+  'chromosomes'  => ['chr1'],
+  'database_dir' => $db_dir->stringify,
+  'files_dir'    => $raw_dir->stringify,
+  'tracks'       => {
+    'tracks' => [
+      {
+        'local_files' => [$vcf_path],
+        'name'        => 'dbSNP',
+        'sorted'      => 1,
+        'type'        => 'vcf',
+        'utils'       => [ { 'name' => 'DbSnp2FormatInfo' } ]
+      }
+    ]
+  }
+};
+
+# write temporary config file
+my $config_file = $raw_dir->child('filterCadd.yml');
+DumpFile( $config_file, $config );
+
+# Prepare a sample VCF for testing
+my $vcf_data = <<'END_VCF';
+##fileformat=VCFv4.1
+##INFO=<ID=RS,Number=1,Type=String,Description="dbSNP ID">
+##INFO=<ID=dbSNPBuildID,Number=1,Type=Integer,Description="dbSNP Build ID">
+##INFO=<ID=SSR,Number=0,Type=Flag,Description="Variant is a short tandem repeat">
+#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
+NC_000001.11    10001   rs1570391677    T       A,C     .       .       RS=1570391677;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=KOREAN:0.9891,0.0109,.|SGDP_PRJ:0,1,.|dbGaP_PopFreq:1,.,0
+NC_000001.11    10002   rs1570391692    A       C       .       .       RS=1570391692;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;FREQ=KOREAN:0.9944,0.005597
+END_VCF
+
+# Write sample VCF to a temporary file
+open my $fh, '>', $vcf_path or die "Could not open $vcf_path: $!";
+print $fh $vcf_data;
+close $fh;
+
+# Initialize the utility and process the VCF
+my $utility = Utils::DbSnp2FormatInfo->new(
+  {
+    config   => $config_file,
+    name     => 'dbSNP',
+    utilName => 'DbSnp2FormatInfo'
+  }
+);
+
+$utility->go($vcf_path);
+
+# Check that the processed file exists and is correctly formatted
+ok( -e $expected_output_vcf_path, "Processed VCF file exists" );
+
+# Read the processed file to check the INFO field
+$fh = path($expected_output_vcf_path)->openr;
+
+ok( <$fh> == "##fileformat=VCFv4.1", 'VCF fileformat is correctly processed' );
+ok( <$fh> == "##INFO=<ID=RS,Number=1,Type=String,Description=\"dbSNP ID\">",
+  'RS population is correctly processed' );
+ok(
+  <$fh>
+    == "##INFO=<ID=dbSNPBuildID,Number=1,Type=Integer,Description=\"dbSNP Build ID\">",
+  'dbSNPBuildID population is correctly processed'
+);
+ok(
+  <$fh>
+    == "##INFO=<ID=SSR,Number=0,Type=Flag,Description=\"Variant is a short tandem repeat\">",
+  'SSR population is correctly processed'
+);
+ok(
+  <$fh>
+    == "##INFO=<ID=KOREAN,Number=A,Type=Float,Description=\"Frequency for KOREAN\">",
+  'KOREAN population is correctly processed'
+);
+ok(
+  <$fh>
+    == "##INFO=<ID=SGDP_PRJ,Number=A,Type=Float,Description=\"Frequency for SGDP_PRJ\">",
+  'SGDP_PRJ population is correctly processed'
+);
+ok(
+  <$fh>
+    == "##INFO=<ID=dbGaP_PopFreq,Number=A,Type=Float,Description=\"Frequency for dbGaP_PopFreq\">",
+  'dbGaP_PopFreq population is correctly processed'
+);
+ok( <$fh> == "#CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO" );
+
+ok(
+  <$fh>
+    == "NC_000001.11    10001   rs1570391677    T       A,C     .       .       RS=1570391677;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;KOREAN=0.0109,.;SGDP_PRJ=0,.;dbGaP_PopFreq=.,0",
+  '1st data row wiht KOREAN, SGDP_PRJ, dbGap freqs are correctly processed'
+);
+ok(
+  <$fh>
+    == "NC_000001.11    10002   rs1570391692    A       C       .       .       RS=1570391692;dbSNPBuildID=154;SSR=0;PSEUDOGENEINFO=DDX11L1:100287102;VC=SNV;R5;GNO;KOREAN=0.005597",
+  '2nd data row with KOREAN freq is correctly processed'
+);
+
+done_testing();