Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added clipping #21

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 51 additions & 1 deletion samclip
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ use constant {
SAM_CIGAR => 5,
SAM_TLEN => 8,
SAM_SEQ => 9,
SAM_QUAL => 10,
};

#----------------------------------------------------------------------
Expand All @@ -41,6 +42,7 @@ my $ref = '';
my $invert = 0;
my $debug = 0;
my $progress = 100_000;
my $clip = 0;

#----------------------------------------------------------------------
sub usage {
Expand All @@ -57,7 +59,8 @@ sub usage {
" --help This help\n",
" --version Print version and exit\n",
" --ref FASTA Reference genome - needs FASTA.fai index\n",
" --max NUM Maximum clip length to allow (default=$max)\n",
" --clip Clip any soft ends; convert to hard clip\n",
" --max NUM Maximum clip length to allow before filtering (default=$max)\n",
" --invert Output rejected SAM lines and ignore good ones\n",
" --debug Print verbose debug info to stderr\n",
" --progress N Print progress every NUM records (default=$progress,none=0)\n",
Expand All @@ -75,6 +78,7 @@ GetOptions(
"help" => \&usage,
"version" => \&version,
"ref=s" => \$ref,
"clip" => \$clip,
"max=i" => \$max,
"invert" => \$invert,
"debug" => \$debug,
Expand Down Expand Up @@ -128,6 +132,11 @@ while (my $line = <ARGV>) {
$L = 0 if $start <= 1+$L;
$R = 0 if $end >= $contiglen-$R;
my $info = $debug ? "CHROM=$sam[SAM_RNAME]:1..$contiglen POS=$start..$end CIGAR=$sam[SAM_CIGAR] L=$L R=$R | HL=$HL SL=$SL SR=$SR HR=$HR max=$max)" : "need --debug";
if($clip){
my $samArr = clip(\@sam, $HL, $SL, $SR, $HR);
@sam = @$samArr;
$line = join("\t", @sam);
}
if ($L > $max or $R > $max) {
msg("BAD! $info") if $debug;
$removed++;
Expand All @@ -145,6 +154,47 @@ msg("Total SAM records $total, removed $removed, allowed $kept, passed", $total-
msg("Header contained $header lines");
msg("Done.");

sub clip{
my($samArr, $HL, $SL, $SR, $HR) = @_;

# Avoid modifying the original array
my @sam = @$samArr;

# Trim the sequence and qual according to soft clips
$sam[SAM_SEQ] = substr($sam[SAM_SEQ], $SL, length($sam[SAM_SEQ])-$SL-$SR);
$sam[SAM_QUAL] = substr($sam[SAM_QUAL], $SL, length($sam[SAM_QUAL])-$SL-$SR);

# update the cigar line: we have now hard clipped the ends
my @cigar;
while($sam[SAM_CIGAR] =~ /(\d+)([MIDNSHP=X])/g){
my($len, $op) = ($1, $2);
if($op eq 'S'){
$op = 'H';
}
push @cigar, [$len, $op];
}
# Merge neighboring hard clips
for(my $i=0; $i<@cigar; $i++){
if($cigar[$i][1] eq 'H' and $i+1 < @cigar and $cigar[$i+1][1] eq 'H'){
$cigar[$i][0] += $cigar[$i+1][0];
splice(@cigar, $i+1, 1);
$i--;
}
}
# Recreate the cigar string
$sam[SAM_CIGAR] = '';
for(@cigar){
my $op = $$_[1];
my $len = $$_[0];
$sam[SAM_CIGAR] .= $len.$op;
}

# update the position of the mapping according to how much we clipped from the left
$sam[SAM_POS] += $SL;

return \@sam;
}

#----------------------------------------------------------------------
sub fai_to_dict {
my($fname) = @_;
Expand Down