diff --git a/main.c b/main.c index 3c625760..84d31297 100644 --- a/main.c +++ b/main.c @@ -7,7 +7,7 @@ #include "mmpriv.h" #include "ketopt.h" -#define MM_VERSION "2.22-r1101" +#define MM_VERSION "2.22-r1105-dirty" #ifdef __linux__ #include @@ -74,6 +74,7 @@ static ko_longopt_t long_options[] = { { "rmq", ko_optional_argument, 347 }, { "qstrand", ko_no_argument, 348 }, { "cap-kalloc", ko_required_argument, 349 }, + { "q-occ-frac", ko_required_argument, 350 }, { "help", ko_no_argument, 'h' }, { "max-intron-len", ko_required_argument, 'G' }, { "version", ko_no_argument, 'V' }, @@ -229,6 +230,7 @@ int main(int argc, char *argv[]) else if (c == 346) opt.mask_len = mm_parse_num(o.arg); // --mask-len else if (c == 348) opt.flag |= MM_F_QSTRAND | MM_F_NO_INV; // --qstrand else if (c == 349) opt.cap_kalloc = mm_parse_num(o.arg); // --cap-kalloc + else if (c == 350) opt.q_occ_frac = atof(o.arg); // --q-occ-frac else if (c == 330) { fprintf(stderr, "[WARNING] \033[1;31m --lj-min-ratio has been deprecated.\033[0m\n"); } else if (c == 314) { // --frag diff --git a/map.c b/map.c index 9c4ad64a..7da9a49c 100644 --- a/map.c +++ b/map.c @@ -252,6 +252,7 @@ void mm_map_frag(const mm_idx_t *mi, int n_segs, const int *qlens, const char ** hash = __ac_Wang_hash(hash); collect_minimizers(b->km, opt, mi, n_segs, qlens, seqs, &mv); + if (opt->q_occ_frac > 0.0f) mm_seed_mz_flt(b->km, &mv, opt->mid_occ, opt->q_occ_frac); if (opt->flag & MM_F_HEAP_SORT) a = collect_seed_hits_heap(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); else a = collect_seed_hits(b->km, opt, opt->mid_occ, mi, qname, &mv, qlen_sum, &n_a, &rep_len, &n_mini_pos, &mini_pos); diff --git a/minimap.h b/minimap.h index a7a886bd..2204b6cd 100644 --- a/minimap.h +++ b/minimap.h @@ -163,6 +163,7 @@ typedef struct { int pe_ori, pe_bonus; float mid_occ_frac; // only used by mm_mapopt_update(); see below + float q_occ_frac; int32_t min_mid_occ, max_mid_occ; int32_t mid_occ; // ignore seeds with occurrences above this threshold int32_t max_occ, max_max_occ, occ_dist; diff --git a/minimap2.1 b/minimap2.1 index 86454716..8e57e446 100644 --- a/minimap2.1 +++ b/minimap2.1 @@ -151,10 +151,16 @@ Lower and upper bounds of k-mer occurrences [10,1000000]. The final k-mer occurr .BR -f }}. This option prevents excessively small or large .B -f -estimated from the input reference. It deprecates +estimated from the input reference. Available since r1034 and deprecating .B --min-occ-floor in earlier versions of minimap2. .TP +.BI --q-occ-frac \ FLOAT +Discard a query minimizer if its occurrence is higher than +.I FLOAT +fraction of query minimizers and than the reference occurrence threshold +[0.02]. Set 0 to disable. Available since r1105. +.TP .BI -e \ INT Sample a high-frequency minimizer every .I INT diff --git a/mmpriv.h b/mmpriv.h index 933e41f5..a2b5a80d 100644 --- a/mmpriv.h +++ b/mmpriv.h @@ -61,6 +61,7 @@ uint32_t ks_ksmall_uint32_t(size_t n, uint32_t arr[], size_t kk); void mm_sketch(void *km, const char *str, int len, int w, int k, uint32_t rid, int is_hpc, mm128_v *p); mm_seed_t *mm_collect_matches(void *km, int *_n_m, int qlen, int max_occ, int max_max_occ, int dist, const mm_idx_t *mi, const mm128_v *mv, int64_t *n_a, int *rep_len, int *n_mini_pos, uint64_t **mini_pos); +void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac); double mm_event_identity(const mm_reg1_t *r); int mm_write_sam_hdr(const mm_idx_t *mi, const char *rg, const char *ver, int argc, char *argv[]); diff --git a/options.c b/options.c index 97347be0..e62515d6 100644 --- a/options.c +++ b/options.c @@ -19,6 +19,7 @@ void mm_mapopt_init(mm_mapopt_t *opt) opt->min_mid_occ = 10; opt->max_mid_occ = 1000000; opt->sdust_thres = 0; // no SDUST masking + opt->q_occ_frac = 0.02f; opt->min_cnt = 3; opt->min_chain_score = 40; diff --git a/python/cmappy.pxd b/python/cmappy.pxd index b3016746..5eedb901 100644 --- a/python/cmappy.pxd +++ b/python/cmappy.pxd @@ -51,6 +51,7 @@ cdef extern from "minimap.h": int pe_ori, pe_bonus float mid_occ_frac + float q_occ_frac int32_t min_mid_occ int32_t mid_occ int32_t max_occ diff --git a/seed.c b/seed.c index baf1b208..08fe5ad8 100644 --- a/seed.c +++ b/seed.c @@ -2,6 +2,31 @@ #include "kalloc.h" #include "ksort.h" +void mm_seed_mz_flt(void *km, mm128_v *mv, int32_t q_occ_max, float q_occ_frac) +{ + mm128_t *a; + size_t i, j, st; + if (mv->n <= q_occ_max || q_occ_frac <= 0.0f || q_occ_max <= 0) return; + KMALLOC(km, a, mv->n); + for (i = 0; i < mv->n; ++i) + a[i].x = mv->a[i].x, a[i].y = i; + radix_sort_128x(a, a + mv->n); + for (st = 0, i = 1; i <= mv->n; ++i) { + if (i == mv->n || a[i].x != a[st].x) { + int32_t cnt = i - st; + if (cnt > q_occ_max && cnt > mv->n * q_occ_frac) + for (j = st; j < i; ++j) + mv->a[a[j].y].x = 0; + st = i; + } + } + kfree(km, a); + for (i = j = 0; i < mv->n; ++i) + if (mv->a[i].x != 0) + mv->a[j++] = mv->a[i]; + mv->n = j; +} + mm_seed_t *mm_seed_collect_all(void *km, const mm_idx_t *mi, const mm128_v *mv, int32_t *n_m_) { mm_seed_t *m;