Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

User specified splicing sites #197

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions align.c
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
if (splice_flag & MM_F_SPLICE_FOR) extra_flag |= rev? KSW_EZ_SPLICE_REV : KSW_EZ_SPLICE_FOR;
if (splice_flag & MM_F_SPLICE_REV) extra_flag |= rev? KSW_EZ_SPLICE_FOR : KSW_EZ_SPLICE_REV;
if (opt->flag & MM_F_SPLICE_FLANK) extra_flag |= KSW_EZ_SPLICE_FLANK;
if (mi->splices) extra_flag |= KSW_EZ_SPLICE_TAGS;
}

/* Look for the start and end of regions to perform DP. This sounds easy
Expand Down Expand Up @@ -666,7 +667,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int

if (qs > 0 && rs > 0) { // left extension
qseq = &qseq0[rev][qs0];
mm_idx_getseq(mi, rid, rs0, rs, tseq);
mm_idx_getseq_splicetags(mi, rid, rs0, rs, tseq);
mm_seq_rev(qs - qs0, qseq);
mm_seq_rev(rs - rs0, tseq);
mm_align_pair(km, opt, qs - qs0, qseq, rs - rs0, tseq, mat, bw, opt->end_bonus, r->split_inv? opt->zdrop_inv : opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY|KSW_EZ_RIGHT|KSW_EZ_REV_CIGAR, ez);
Expand Down Expand Up @@ -694,7 +695,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int
bw1 = qe - qs > re - rs? qe - qs : re - rs;
// perform alignment
qseq = &qseq0[rev][qs];
mm_idx_getseq(mi, rid, rs, re, tseq);
mm_idx_getseq_splicetags(mi, rid, rs, re, tseq);
if (is_sr) { // perform ungapped alignment
assert(qe - qs == re - rs);
ksw_reset_extz(ez);
Expand Down Expand Up @@ -733,7 +734,7 @@ static void mm_align1(void *km, const mm_mapopt_t *opt, const mm_idx_t *mi, int

if (!dropped && qe < qe0 && re < re0) { // right extension
qseq = &qseq0[rev][qe];
mm_idx_getseq(mi, rid, re, re0, tseq);
mm_idx_getseq_splicetags(mi, rid, re, re0, tseq);
mm_align_pair(km, opt, qe0 - qe, qseq, re0 - re, tseq, mat, bw, opt->end_bonus, opt->zdrop, extra_flag|KSW_EZ_EXTZ_ONLY, ez);
if (ez->n_cigar > 0) {
mm_append_cigar(r, ez->n_cigar, ez->cigar);
Expand Down
151 changes: 151 additions & 0 deletions index.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "mmpriv.h"
#include "kvec.h"
#include "khash.h"
#include "ksort.h"

#define idx_hash(a) ((a)>>1)
#define idx_eq(a, b) ((a)>>1 == (b)>>1)
Expand All @@ -31,6 +32,14 @@ typedef struct mm_idx_bucket_s {
void *h; // hash table indexing _p_ and minimizers appearing once
} mm_idx_bucket_t;

// Definitions for the arrays of user-specified splices sites:
typedef uint32_t splice_t;
typedef kvec_t(splice_t) splice_v;
#define SPLICE_DONOR (1U<<31) // MSB of splice position indicates if its a donor (1) or acceptor (0)
#define SPLICE_IS_DONOR(pos) (((pos) & SPLICE_DONOR) != 0) // True if the splice site is a donor
#define SPLICE_POS(pos) ((pos) & ~SPLICE_DONOR) // Splice site coordinate (untagged)
KRADIX_SORT_INIT(splice, splice_t, SPLICE_POS, sizeof(splice_t))

mm_idx_t *mm_idx_init(int w, int k, int b, int flag)
{
mm_idx_t *mi;
Expand All @@ -55,6 +64,11 @@ void mm_idx_destroy(mm_idx_t *mi)
kh_destroy(idx, (idxhash_t*)mi->B[i].h);
}
}
if (mi->splices) {
for(i = 0 ; i < mi->n_seq ; i++)
kv_destroy(((splice_v*) mi->splices)[i]);
kfree(mi->km, mi->splices);
}
if (!mi->km) {
for (i = 0; i < mi->n_seq; ++i)
free(mi->seq[i].name);
Expand Down Expand Up @@ -146,6 +160,27 @@ int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, ui
return en - st;
}

int mm_idx_getseq_splicetags(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq)
{
int res = mm_idx_getseq(mi, rid, st, en, seq);
if (mi->splices) {
const splice_v splices = ((const splice_v*) mi->splices)[rid];
// Bissects splices array for the left of the interval
int low = 0, high = kv_size(splices) - 1;
while(low <= high) {
size_t mid = (low + high) / 2;
if (SPLICE_POS(kv_A(splices, mid)) >= st)
high = mid - 1;
else
low = mid + 1;
}
const splice_t *p, *splices_end = &kv_A(splices, kv_size(splices));
for(p = &kv_A(splices, low) ; p < splices_end && SPLICE_POS(*p) < en; p++)
seq[SPLICE_POS(*p) - st] |= 1 << (4 + SPLICE_IS_DONOR(*p));
}
return res;
}

int32_t mm_idx_cal_max_occ(const mm_idx_t *mi, float f)
{
int i;
Expand Down Expand Up @@ -585,3 +620,119 @@ int mm_idx_reader_eof(const mm_idx_reader_t *r) // TODO: in extremely rare cases
{
return r->is_idx? (feof(r->fp.idx) || ftell(r->fp.idx) == r->idx_size) : mm_bseq_eof(r->fp.seq);
}

int mm_idx_splice_load(const char* fname, mm_idx_t* mi) {
unsigned i;

int type = 0; // 0=Unknown, 1=BED (introns), 2=TSV/CSV (unpaired sites)
const char* delim = " \t\r\n";
size_t fname_len = strlen(fname);
if (fname_len >= 4) {
if(strncasecmp(fname + fname_len - 4, ".bed", 4) == 0)
type = 1;
else if (strncasecmp(fname + fname_len - 4, ".tsv", 4) == 0)
type = 2;
else if (strncasecmp(fname + fname_len - 4, ".csv", 4) == 0) {
type = 2;
delim = ",\r\n";
}
}
if (!type) {
fprintf(stderr, "Unknown file type for splicing annotations: \"%s\"\n", fname);
return -1;
}

FILE* f = fopen(fname, "r");
if (f == NULL) {
perror("Could not open splice file");
return -1;
}

mm_idx_index_name(mi);
splice_v* splice_vectors = (splice_v*) kmalloc(mi->km, sizeof(splice_v) * mi->n_seq);
for(i = 0 ; i < mi->n_seq ; i++)
kv_init(splice_vectors[i]);
mi->splices = splice_vectors;

char line[256];
int linenb = 0;
if(type == 1) {
while (fgets(line, sizeof(line), f)) {
linenb++;
char* chr = strtok(line, delim);
if (chr == NULL || strcmp(chr, "track") == 0 || strcmp(chr, "browser") == 0)
continue;

// Corresponding sequence in the index:
int seq_id = mm_idx_name2id(mi, chr);
if (seq_id < 0) {
if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] Name \"%s\" from introns annotations not found in reference database.\n", chr);
continue;
}
uint32_t seq_length = mi->seq[seq_id].len;

char* start_s = strtok(NULL, delim);
char* stop_s = strtok(NULL, delim);
if (start_s == NULL || stop_s == NULL) {
fprintf(stderr, "%s: malformed splice annotation at line %d\n", fname, linenb);
fclose(f);
return -1;
}

long donor = atol(start_s) - 1; // "Donor" site is before the first base of the intron
long acceptor = atol(stop_s) - 1; // "Acceptor" site is the last base of the intron (BED use past the end notation)
if(donor < -1 || donor >= seq_length || acceptor < 0 || acceptor >= seq_length) {
fprintf(stderr, "[WARNING] Ignored splicing site [%ld-%ld) out of range for reference len(%s)=%u.\n", donor+1, acceptor+1, chr, seq_length);
continue;
}

assert(donor < SPLICE_DONOR && acceptor < SPLICE_DONOR);
kv_push(splice_t, 0, splice_vectors[seq_id], donor | SPLICE_DONOR);
kv_push(splice_t, 0, splice_vectors[seq_id], acceptor);
}
} else if(type == 2) {
while (fgets(line, sizeof(line), f)) {
linenb++;
char* chr = strtok(line, delim);
if (chr == NULL)
continue;

int seq_id = mm_idx_name2id(mi, chr);
if (seq_id < 0) {
if (mm_verbose >= 2)
fprintf(stderr, "[WARNING] Name \"%s\" from introns annotations not found in reference database.\n", chr);
continue;
}
uint32_t seq_length = mi->seq[seq_id].len;

char* pos_s = strtok(NULL, delim);
char* kind_s = strtok(NULL, delim);
if (pos_s == NULL || kind_s == NULL || strlen(kind_s) != 1 || ((kind_s[0] | 0x20) != 'l' && (kind_s[0] | 0x20) != 'r') ) {
fprintf(stderr, "%s: malformed splice annotation at line %d\n", fname, linenb);
fclose(f);
return -1;
}

long pos = atol(pos_s);
if(pos < 0 || pos >= seq_length) {
fprintf(stderr, "[WARNING] Ignored splicing site %ld %s out of range for reference len(%s)=%u.\n", pos, kind_s, chr, seq_length);
continue;
}

assert(pos < SPLICE_DONOR);
pos |= (kind_s[0] | 0x20) == 'l' ? SPLICE_DONOR : 0;
kv_push(splice_t, 0, splice_vectors[seq_id], pos);
}
}

// Sort splicing coordinates for each reference sequence
for (i = 0 ; i < mi->n_seq ; i++) {
splice_v splices = splice_vectors[i];
if (kv_size(splices))
radix_sort_splice(splices.a, splices.a + kv_size(splices));
}

fclose(f);
return 0;
}
1 change: 1 addition & 0 deletions ksw2.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#define KSW_EZ_SPLICE_FOR 0x100
#define KSW_EZ_SPLICE_REV 0x200
#define KSW_EZ_SPLICE_FLANK 0x400
#define KSW_EZ_SPLICE_TAGS 0x800

#ifdef __cplusplus
extern "C" {
Expand Down
53 changes: 35 additions & 18 deletions ksw2_exts2_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -107,26 +107,43 @@ void ksw_exts2_sse(void *km, int qlen, const uint8_t *query, int tlen, const uin
}

for (t = 0; t < qlen; ++t) qr[t] = query[qlen - 1 - t];
memcpy(sf, target, tlen);
if (flag & KSW_EZ_SPLICE_TAGS) // Remove tags on target
for (t = 0; t < tlen; ++t) sf[t] = target[t] & ((1<<4) - 1);
else
memcpy(sf, target, tlen);

// set the donor and acceptor arrays. TODO: this assumes 0/1/2/3 encoding!
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV)) {
int semi_cost = flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0; // GTr or yAG is worth 0.5 bit; see PMID:18688272
memset(donor, -noncan, tlen_ * 16);
for (t = 0; t < tlen - 4; ++t) {
int can_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG
if ((flag & KSW_EZ_SPLICE_FOR) && target[t+1] == 2 && target[t+2] == 3) can_type = 1; // GTr...
if ((flag & KSW_EZ_SPLICE_REV) && target[t+1] == 1 && target[t+2] == 3) can_type = 1; // CTr...
if (can_type && (target[t+3] == 0 || target[t+3] == 2)) can_type = 2;
if (can_type) ((int8_t*)donor)[t] = can_type == 2? 0 : semi_cost;
}
memset(acceptor, -noncan, tlen_ * 16);
for (t = 2; t < tlen; ++t) {
int can_type = 0;
if ((flag & KSW_EZ_SPLICE_FOR) && target[t-1] == 0 && target[t] == 2) can_type = 1; // ...yAG
if ((flag & KSW_EZ_SPLICE_REV) && target[t-1] == 0 && target[t] == 1) can_type = 1; // ...yAC
if (can_type && (target[t-2] == 1 || target[t-2] == 3)) can_type = 2;
if (can_type) ((int8_t*)acceptor)[t] = can_type == 2? 0 : semi_cost;
int8_t splice_scores[4] = { -noncan, flag&KSW_EZ_SPLICE_FLANK? -noncan/2 : 0, 0 , noncan/2 };
if (flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV|KSW_EZ_SPLICE_TAGS)) {
__m128i noncan_ = _mm_set1_epi8(-noncan);
for (t = 0; t < tlen_; ++t) donor[t] = noncan_;
for (t = 0; t < tlen_; ++t) acceptor[t] = noncan_;

int canon_detect = !!(flag & (KSW_EZ_SPLICE_FOR|KSW_EZ_SPLICE_REV));
const uint8_t external_base = flag & KSW_EZ_SPLICE_FOR ? 2 : 1; // Forward => G, reverse => C
const uint8_t external_base_comp = flag & KSW_EZ_SPLICE_FOR ? 1 : 2;
for (t = 0; t < tlen; ++t) {
int donor_type = 0; // type of canonical site: 0=none, 1=GT/AG only, 2=GTr/yAG, 3=user-specified
if (target[t] & (1<<5)) { // user supplied donor site
if(canon_detect && t < tlen - 3 && sf[t+1] == external_base_comp && sf[t+2] == 3)
// A canonical user site on the opposite strand to the one on which canonical sites are currently detected,
donor_type = 0; // so we disable it.
else
donor_type = 3;
} else if (canon_detect && t < tlen - 3 && sf[t+1] == external_base && sf[t+2] == 3) // Forward => GT, reverse => CT
donor_type = (sf[t+3] == 0 || sf[t+3] == 2) ? 2 : 1;

int acceptor_type = 0;
if (target[t] & (1<<4)) { // user supplied acceptor site
if(canon_detect && t >= 2 && sf[t-1] == 0 && sf[t] == external_base_comp)
acceptor_type = 0;
else
acceptor_type = 3;
} else if (canon_detect && t >= 2 && sf[t-1] == 0 && sf[t] == external_base) // Forward => AG, reverse => AC
acceptor_type = (sf[t-2] == 1 || sf[t-2] == 3) ? 2 : 1;

if (donor_type) ((int8_t*)donor)[t] = splice_scores[donor_type];
if (acceptor_type) ((int8_t*)acceptor)[t] = splice_scores[acceptor_type];
}
}

Expand Down
26 changes: 18 additions & 8 deletions main.c
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ int main(int argc, char *argv[])
mm_mapopt_t opt;
mm_idxopt_t ipt;
int i, c, n_threads = 3, n_parts, old_best_n = -1;
char *fnw = 0, *rg = 0, *s;
char *fnw = 0, *rg = 0, *splice_fname = 0, *s;
FILE *fp_help = stderr;
mm_idx_reader_t *idx_rdr;
mm_idx_t *mi;
Expand Down Expand Up @@ -228,13 +228,17 @@ int main(int argc, char *argv[])
else opt.mid_occ = (int)(x + .499);
if (*p == ',') opt.max_occ = (int)(strtod(p+1, &p) + .499);
} else if (c == 'u') {
if (*o.arg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV; // both strands
else if (*o.arg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV; // match GT-AG
else if (*o.arg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; // match CT-AC (reverse complement of GT-AG)
else if (*o.arg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); // don't try to match the GT-AG signal
else {
fprintf(stderr, "[ERROR]\033[1;31m unrecognized cDNA direction\033[0m\n");
return 1;
if(o.arg[1] == '\0') {
if (*o.arg == 'b') opt.flag |= MM_F_SPLICE_FOR|MM_F_SPLICE_REV; // both strands
else if (*o.arg == 'f') opt.flag |= MM_F_SPLICE_FOR, opt.flag &= ~MM_F_SPLICE_REV; // match GT-AG
else if (*o.arg == 'r') opt.flag |= MM_F_SPLICE_REV, opt.flag &= ~MM_F_SPLICE_FOR; // match CT-AC (reverse complement of GT-AG)
else if (*o.arg == 'n') opt.flag &= ~(MM_F_SPLICE_FOR|MM_F_SPLICE_REV); // don't try to match the GT-AG signal
else {
fprintf(stderr, "[ERROR]\033[1;31m unrecognized cDNA direction\033[0m\n");
return 1;
}
} else {
splice_fname = o.arg;
}
} else if (c == 'z') {
opt.zdrop = opt.zdrop_inv = strtol(o.arg, &s, 10);
Expand Down Expand Up @@ -337,6 +341,12 @@ int main(int argc, char *argv[])
mm_idx_reader_close(idx_rdr);
return 1;
}
if (splice_fname != 0 && mm_idx_splice_load(splice_fname, mi) < 0) {
fprintf(stderr, "[ERROR] Could not load the splices from \"%s\".\n", splice_fname);
mm_idx_destroy(mi);
mm_idx_reader_close(idx_rdr);
return 1;
}
if ((opt.flag & MM_F_OUT_SAM) && idx_rdr->n_parts == 1) {
if (mm_idx_reader_eof(idx_rdr)) {
mm_write_sam_hdr(mi, rg, MM_VERSION, argc, argv);
Expand Down
12 changes: 11 additions & 1 deletion minimap.h
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ typedef struct {
mm_idx_seq_t *seq; // sequence name, length and offset
uint32_t *S; // 4-bit packed sequence
struct mm_idx_bucket_s *B; // index (hidden)
void *km, *h;
void *km, *h, *splices;
} mm_idx_t;

// minimap2 alignment
Expand Down Expand Up @@ -256,6 +256,15 @@ mm_idx_t *mm_idx_load(FILE *fp);
*/
void mm_idx_dump(FILE *fp, const mm_idx_t *mi);

/**
* Load splicing sites in the index
*
* @param fname file name.
*
* @return 0 in case of success
*/
int mm_idx_splice_load(const char* fname, mm_idx_t *mi);

/**
* Create an index from strings in memory
*
Expand Down Expand Up @@ -362,6 +371,7 @@ int mm_gen_MD(void *km, char **buf, int *max_len, const mm_idx_t *mi, const mm_r
int mm_idx_index_name(mm_idx_t *mi);
int mm_idx_name2id(const mm_idx_t *mi, const char *name);
int mm_idx_getseq(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);
int mm_idx_getseq_splicetags(const mm_idx_t *mi, uint32_t rid, uint32_t st, uint32_t en, uint8_t *seq);

// deprecated APIs for backward compatibility
void mm_mapopt_init(mm_mapopt_t *opt);
Expand Down
2 changes: 1 addition & 1 deletion options.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ void mm_mapopt_init(mm_mapopt_t *opt)

void mm_mapopt_update(mm_mapopt_t *opt, const mm_idx_t *mi)
{
if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV))
if ((opt->flag & MM_F_SPLICE_FOR) || (opt->flag & MM_F_SPLICE_REV) || (mi->splices != 0))
opt->flag |= MM_F_SPLICE;
if (opt->mid_occ <= 0)
opt->mid_occ = mm_idx_cal_max_occ(mi, opt->mid_occ_frac);
Expand Down