From 281ae6ba8957cbff77761d5e613397fca4a9f357 Mon Sep 17 00:00:00 2001 From: chhylp123 Date: Tue, 10 Oct 2023 12:40:29 -0400 Subject: [PATCH] add scaffolding --- CommandLines.cpp | 14 +- CommandLines.h | 5 +- Overlaps.cpp | 1599 ++++++++++++++++++++++++++++++++++++++++++---- Overlaps.h | 2 +- hic.cpp | 2 +- inter.cpp | 885 +++++++++++++++++++++++-- inter.h | 4 +- 7 files changed, 2338 insertions(+), 173 deletions(-) diff --git a/CommandLines.cpp b/CommandLines.cpp index 186b952..1bed13d 100644 --- a/CommandLines.cpp +++ b/CommandLines.cpp @@ -62,6 +62,8 @@ static ko_longopt_t long_options[] = { { "path-min", ko_required_argument, 347}, { "trio-dual", ko_no_argument, 348}, { "ul-cut", ko_required_argument, 349}, + { "dual-scaf", ko_no_argument, 350}, + { "scaf-gap", ko_required_argument, 351}, // { "path-round", ko_required_argument, 348}, { 0, 0, 0 } }; @@ -163,7 +165,7 @@ void Print_H(hifiasm_opt_t* asm_opt) fprintf(stderr, " --l-msjoin INT\n"); fprintf(stderr, " detect misjoined unitigs of >=INT in size; 0 to disable [%lu]\n", asm_opt->misjoin_len); - fprintf(stderr, " Ultra-Long-integration (beta):\n"); + fprintf(stderr, " Ultra-Long-integration:\n"); fprintf(stderr, " --ul FILEs file names of Ultra-Long reads [r1.fq,r2.fq,...]\n"); fprintf(stderr, " --ul-rate FLOAT\n"); fprintf(stderr, " error rate of Ultra-Long reads [%.3g]\n", asm_opt->ul_error_rate); @@ -179,6 +181,11 @@ void Print_H(hifiasm_opt_t* asm_opt) fprintf(stderr, " filter out ul_min_base); // fprintf(stderr, " --low-het enable it for genomes with very low het heterozygosity rate (<0.0001%%)\n"); + fprintf(stderr, " Dual-Scaffolding:\n"); + fprintf(stderr, " --dual-scaf output scaffolding\n"); + fprintf(stderr, " --scaf-gap INT\n"); + fprintf(stderr, " max gap size for scaffolding [%ld]\n", asm_opt->self_scaf_gap_max); + fprintf(stderr, "Example: ./hifiasm -o NA12878.asm -t 32 NA12878.fq.gz\n"); fprintf(stderr, "See `https://hifiasm.readthedocs.io/en/latest/' or `man ./hifiasm.1' for complete documentation.\n"); } @@ -295,6 +302,9 @@ void init_opt(hifiasm_opt_t* asm_opt) asm_opt->trio_cov_het_ovlp = -1; asm_opt->ul_min_base = 0; asm_opt->self_scaf = 0; + asm_opt->self_scaf_min = 250000; + asm_opt->self_scaf_reliable_min = 5000000; + asm_opt->self_scaf_gap_max = 3000000; } void destory_enzyme(enzyme* f) @@ -841,6 +851,8 @@ int CommandLine_process(int argc, char *argv[], hifiasm_opt_t* asm_opt) else if (c == 347) asm_opt->min_path_drop_rate = atof(opt.arg); else if (c == 348) asm_opt->trio_cov_het_ovlp = 1; else if (c == 349) asm_opt->ul_min_base = atol(opt.arg); + else if (c == 350) asm_opt->self_scaf = 1; + else if (c == 351) asm_opt->self_scaf_gap_max = atol(opt.arg); else if (c == 'l') { ///0: disable purge_dup; 1: purge containment; 2: purge overlap asm_opt->purge_level_primary = asm_opt->purge_level_trio = atoi(opt.arg); } diff --git a/CommandLines.h b/CommandLines.h index 1b4eb33..c66f190 100644 --- a/CommandLines.h +++ b/CommandLines.h @@ -5,7 +5,7 @@ #include #include -#define HA_VERSION "0.19.6-r597" +#define HA_VERSION "0.19.7-r598" #define VERBOSE 0 @@ -148,6 +148,9 @@ typedef struct { uint8_t hifi_pst_join, ul_pst_join; uint32_t ul_min_base; uint8_t self_scaf; + uint64_t self_scaf_min; + uint64_t self_scaf_reliable_min; + int64_t self_scaf_gap_max; } hifiasm_opt_t; extern hifiasm_opt_t asm_opt; diff --git a/Overlaps.cpp b/Overlaps.cpp index 9a455e7..3bd0420 100644 --- a/Overlaps.cpp +++ b/Overlaps.cpp @@ -99,7 +99,7 @@ typedef struct { // global data structure for kt_pipeline() kv_u_trans_t *res; kv_u_trans_t *ta; uint64_t n_thread, ov_cutoff; - double small_ov_rate, sec_rate; + double small_ov_rate, sec_rate, sc_sec_rate; asg64_v *srt; u_trans_cluster *cu; } u_trans_clean_t; @@ -153,6 +153,17 @@ typedef struct { size_t n, m; } kvect_N_t; +typedef struct { + uint64_t m, n, is_c; + uint64_t *a; +} sec_t; + +typedef struct { + sec_t *a; + size_t n, m; + ma_ug_t *ctg; +} kvect_sec_t; + ///this value has been updated at the first line of build_string_graph_without_clean long long min_thres; @@ -10552,12 +10563,12 @@ kvec_asg_arc_t_warp* edge, int max_hang, int min_ovlp, kvec_asg_arc_t_warp *E, u } return 0; } -uint32_t get_ug_coverage(ma_utg_t* u, asg_t* read_g, const ma_sub_t* coverage_cut, -ma_hit_t_alloc* sources, R_to_U* ruIndex, uint8_t* r_flag) +uint32_t get_ug_coverage(ma_utg_t* u, asg_t* read_g, const ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, R_to_U* ruIndex, uint8_t* r_flag, uint64_t *rR, uint64_t *rC) { uint32_t k, j, rId, tn, is_Unitig; long long R_bases = 0, C_bases = 0; ma_hit_t *h; + if(rR) (*rR) = 0; if(rC) (*rC) = 0; if(u->m == 0) return 0; for (k = 0; k < u->n; k++) @@ -10597,6 +10608,7 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex, uint8_t* r_flag) r_flag[rId] = 0; } + if(rR) (*rR) = R_bases; if(rC) (*rC) = C_bases; return R_bases == 0? 0:C_bases/R_bases; } @@ -10721,9 +10733,9 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex, int print_seq, const char* prefix, FIL if(p->m == 0) continue; sprintf(name, "%s%.6d%c", prefix, i + 1, "lc"[p->circ]); if (print_seq) fprintf(fp, "S\t%s\t%s\tLN:i:%d\trd:i:%u\n", name, p->s? p->s : "*", p->len, - pc?get_ug_coverage(p, read_g, coverage_cut, sources, ruIndex, primary_flag):0); + pc?get_ug_coverage(p, read_g, coverage_cut, sources, ruIndex, primary_flag, NULL, NULL):0); else fprintf(fp, "S\t%s\t*\tLN:i:%d\trd:i:%u\n", name, p->len, - pc?get_ug_coverage(p, read_g, coverage_cut, sources, ruIndex, primary_flag):0); + pc?get_ug_coverage(p, read_g, coverage_cut, sources, ruIndex, primary_flag, NULL, NULL):0); for (j = l = 0; j < p->n; j++) { if(p->a[j] != (uint64_t)-1) @@ -10805,6 +10817,82 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex, const char* prefix, FILE *fp) ma_ug_print2(ug, &R_INF, read_g, coverage_cut, sources, ruIndex, 1, prefix, fp); } +void ma_scg_print(const kvect_sec_t *sc, All_reads *RNF, asg_t* read_g, const ma_sub_t *coverage_cut, ma_hit_t_alloc* sources, R_to_U* ruIndex, int print_seq, const char* prefix, FILE *fp) +{ + uint8_t* primary_flag = read_g?(uint8_t*)calloc(read_g->n_seq, sizeof(uint8_t)):NULL; + uint64_t i, j, l, pc = read_g && coverage_cut && sources && ruIndex?1:0, tl, m, x; sec_t *scp; + char name[32]; uint64_t Rb, Cb, tRb, tCb, Cov; ma_utg_t *z = NULL; uint8_t c; + for (i = 0; i < sc->n; ++i) { // the Segment lines in GFA + scp = &(sc->a[i]); + for (j = tl = tRb = tCb = 0; j < scp->n; j++) { + z = &(sc->ctg->u.a[((uint32_t)scp->a[j])>>1]); tl += z->len; tl += (scp->a[j]>>32); + if(pc) { + get_ug_coverage(z, read_g, coverage_cut, sources, ruIndex, primary_flag, &Rb, &Cb); tRb += Rb; tCb += Cb; + } + } + sprintf(name, "%s%.6lu%c", prefix, i + 1, "lc"[scp->is_c]); Cov = ((tRb==0)?(0):(tCb/tRb)); + + if (!print_seq) { + fprintf(fp, "S\t%s\t*\tLN:i:%lu\trd:i:%lu\n", name, tl, Cov); + } else { + fprintf(fp, "S\t%s\t", name); + for (j = 0; j < scp->n; j++) { + z = &(sc->ctg->u.a[((uint32_t)scp->a[j])>>1]); + if(!(((uint32_t)scp->a[j])&1)) { + fprintf(fp, "%s", z->s); + } else { + for(l = 0; l < z->len; l++) { + c = (uint8_t)(z->s[z->len-1-l]); + fprintf(fp, "%c", ((c>=128)?'N':comp_tab[c])); + } + } + for(l = 0; l < (scp->a[j]>>32); l++) fprintf(fp, "N"); + } + fprintf(fp, "\tLN:i:%lu\trd:i:%lu\n", tl, Cov); + } + + + for (j = tl = 0; j < scp->n; j++) { + z = &(sc->ctg->u.a[((uint32_t)scp->a[j])>>1]); l = tl; + if(!(((uint32_t)scp->a[j])&1)) { + for (m = 0; m < z->n; m++) { + x = z->a[m]>>33; + if(xtotal_reads) { + fprintf(fp, "A\t%s\t%lu\t%c\t%.*s\t%d\t%d\tid:i:%lu\tHG:A:%c\n", name, l, "+-"[(z->a[m]>>32)&1], + (int)Get_NAME_LENGTH((*RNF), x), Get_NAME((*RNF), x), + coverage_cut?coverage_cut[x].s:0, coverage_cut?coverage_cut[x].e:(int)Get_READ_LENGTH((*RNF), x), x, + "apmaaa"[((RNF->trio_flag[x]!=FATHER && RNF->trio_flag[x]!=MOTHER)?AMBIGU:RNF->trio_flag[x])]); + } else { + fprintf(fp, "A\t%s\t%lu\t%c\t%s\t%d\t%d\tid:i:%lu\tHG:A:%c\n", name, l, "+-"[(z->a[m]>>32)&1], + "FAKE", coverage_cut?coverage_cut[x].s:0, coverage_cut?coverage_cut[x].e:(int)Get_READ_LENGTH((*RNF), x), x, '*'); + } + l += (uint32_t)(z->a[m]); + } + } else { + for (m = 0; m < z->n; m++) { + x = z->a[z->n-m-1]>>33; + if(xtotal_reads) { + fprintf(fp, "A\t%s\t%lu\t%c\t%.*s\t%d\t%d\tid:i:%lu\tHG:A:%c\n", name, l, "+-"[((z->a[z->n-m-1]>>32)&1)^1], + (int)Get_NAME_LENGTH((*RNF), x), Get_NAME((*RNF), x), + coverage_cut?coverage_cut[x].s:0, coverage_cut?coverage_cut[x].e:(int)Get_READ_LENGTH((*RNF), x), x, + "apmaaa"[((RNF->trio_flag[x]!=FATHER && RNF->trio_flag[x]!=MOTHER)?AMBIGU:RNF->trio_flag[x])]); + } else { + fprintf(fp, "A\t%s\t%lu\t%c\t%s\t%d\t%d\tid:i:%lu\tHG:A:%c\n", name, l, "+-"[((z->a[z->n-m-1]>>32)&1)^1], + "FAKE", coverage_cut?coverage_cut[x].s:0, coverage_cut?coverage_cut[x].e:(int)Get_READ_LENGTH((*RNF), x), x, '*'); + } + l += (uint32_t)(z->a[z->n-m-1]); + } + } + tl = l; + + if((scp->a[j]>>32) > 0) fprintf(fp, "A\t%s\t%lu\t+\tNs\t0\t%lu\tid:i:-1\tHG:A:a\n", name, l, (scp->a[j]>>32)); + + tl += (scp->a[j]>>32); + } + } + free(primary_flag); +} + int asg_cut_internal(asg_t *g, int max_ext) { asg64_v a = {0,0,0}; @@ -14926,7 +15014,7 @@ static void worker_for_trans_sec_cut(void *data, long i, int tid) // callback fo mz = &(a[ss[z]]); if(mz->qs >= cz->qe) break; if(mz->occ == ((uint32_t)-1)) continue;///has been deleted - if((mz->f != RC_0) && (mz->f != RC_1) && (cz->nw > (mz->nw*0.95))) continue; + if((mz->f != RC_0) && (mz->f != RC_1) && (cz->nw > (mz->nw*s->sc_sec_rate/**0.95**/))) continue; os = MAX(cz->qs, mz->qs); oe = MIN(cz->qe, mz->qe); ovq = ((oe > os)? (oe - os):0); if(!ovq) continue; @@ -15199,7 +15287,7 @@ u_trans_cluster* gen_u_trans_cluster(kv_u_trans_t *ta) return p; } -void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g) +void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, double sc_sec_rate, uint64_t uniform_only) { u_trans_clean_t sl; uint64_t k, i, l, st, occ; ha_mzl_t *tz; ha_mzl_v srt_a; kv_u_trans_t *bl; u_trans_t *z; @@ -15208,7 +15296,7 @@ void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g kt_u_trans_t_idx(ta, ug->g->n_seq); sl.n_thread = asm_opt.thread_num; sl.ug = ug; sl.rg = read_g; sl.ta = ta; CALLOC(sl.res, sl.n_thread); - sl.ov_cutoff = 3; sl.small_ov_rate = 0.8; + sl.ov_cutoff = 3; sl.small_ov_rate = 0.8; sl.sc_sec_rate = sc_sec_rate; // dbg_prt_utg_trans(ta, ug, "pre"); kt_for(sl.n_thread, worker_for_trans_clean_re, &sl, sl.ta->idx.n); @@ -15252,10 +15340,11 @@ void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g free(sl.res); kv_destroy(srt_a); kt_u_trans_t_idx(ta, ug->g->n_seq); // dbg_prt_utg_trans(ta, ug, "after"); + if(uniform_only) return; - CALLOC(sl.srt, sl.n_thread); sl.sec_rate = 0.5; + CALLOC(sl.srt, sl.n_thread); sl.sec_rate = 0.5; sl.sc_sec_rate = sc_sec_rate; kt_for(sl.n_thread, worker_for_trans_sec_cut, &sl, sl.ta->idx.n); sl.cu = gen_u_trans_cluster(ta); kt_for(sl.n_thread, worker_for_trans_sec_cut, &sl, sl.cu->idx.n); @@ -15283,7 +15372,7 @@ void refine_hic_trans(ug_opt_t *opt, kv_u_trans_t *ta, asg_t *sg, ma_ug_t *ug) trans_base_infer(ug, sg, opt, ta, NULL); } // dbg_prt_utg_trans(ta, ug, "pre"); - clean_u_trans_t_idx_filter_adv(ta, ug, sg); + clean_u_trans_t_idx_filter_adv(ta, ug, sg, 0.95, 0); // dbg_prt_utg_trans(ta, ug, "after"); } @@ -17185,7 +17274,7 @@ void gen_bp_phasing(ug_opt_t *opt, kv_u_trans_t *ta, ma_ug_t *ug, asg_t *sg) trans_base_infer(ug, sg, opt, ta, bub); } // dbg_prt_utg_trans(ta, ug, "after"); - clean_u_trans_t_idx_filter_adv(ta, ug, sg); + clean_u_trans_t_idx_filter_adv(ta, ug, sg, 0.95, 0); bp_solve(opt, ta, ug, sg, bub, asm_opt.trans_base_rate); @@ -20836,6 +20925,22 @@ R_to_U* ruIndex, int max_hang, int min_ovlp, char *f_prefix) free(gfa_name); } +void output_hap_sc_graph(kvect_sec_t *ug, asg_t *sg, kvec_asg_arc_t_warp *arcs, ma_sub_t* coverage_cut, char* output_file_name, uint8_t flag, ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp, char *f_prefix) +{ + char* gfa_name = (char*)malloc(strlen(output_file_name)+100); + sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); + FILE* output_file = fopen(gfa_name, "w"); + + fprintf(stderr, "Writing %s to disk... \n", gfa_name); + + ma_ug_seq(ug->ctg, sg, coverage_cut, sources, arcs, max_hang, min_ovlp, 0, 1); + // ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); + ma_scg_print(ug, &R_INF, sg, coverage_cut, sources, ruIndex, 1, (flag==FATHER?"h1tg":"h2tg"), output_file); + fclose(output_file); + + free(gfa_name); +} + void filter_set_kug(uint8_t* trio_flag, asg_t *rg, uint8_t *rf, kvec_asg_arc_t_warp *r_edges, float f_rate, ma_ug_t **ug) { @@ -21136,12 +21241,13 @@ uint64_t append_miss_nid(asg_t *sg, ma_ug_t *hap0, ma_ug_t *hap1, uint8_t *ff, u return n_base; } -void prt_scaf_res_t(scaf_res_t *pa, ma_ug_t *ref, ma_ug_t *ctg) +void prt_scaf_res_t(scaf_res_t *pa, ma_ug_t *ref, ma_ug_t *ctg, const char *cn) { uint32_t k, i, z, a_n; ma_utg_t *rch; ul_vec_t *idx; uc_block_t *a; for(i = 0; i < ctg->u.n; i++) { rch = &(ctg->u.a[i]); idx = &(pa->a[i]); - fprintf(stderr, "[M::%s] rch->len::%u, rch->n::%u, idx->n::%u\n", __func__, (uint32_t)rch->len, (uint32_t)rch->n, (uint32_t)idx->bb.n); + fprintf(stderr, "[M::%s] %s%.6u%c. rch->len::%u, rch->n::%u, idx->n::%u\n", __func__, cn, i + 1, "lc"[rch->circ], (uint32_t)rch->len, (uint32_t)rch->n, (uint32_t)idx->bb.n); + if(idx->bb.n > 1) fprintf(stderr, "[M::%s::>>>>>>] idx->n::%u\n", __func__, (uint32_t)idx->bb.n); // for (k = 0; k < idx->bb.n; k++) { // fprintf(stderr, "[k->%u::utg%.6u%c(len->%u::n->%u)]\tq::[%u, %u)\t%c\tt::[%u, %u)\n", // k, (idx->bb.a[k].hid) + 1, "lc"[ref->u.a[(idx->bb.a[k].hid)].circ], ref->u.a[(idx->bb.a[k].hid)].len, ref->u.a[(idx->bb.a[k].hid)].n, @@ -21153,6 +21259,8 @@ void prt_scaf_res_t(scaf_res_t *pa, ma_ug_t *ref, ma_ug_t *ctg) fprintf(stderr, "q::[%u, %u)\n", idx->bb.a[k].qs, idx->bb.a[k].qe); for (z = 0; z < a_n; z++) fprintf(stderr, "utg%.6u%c,", (a[z].hid)+1, "lc"[ref->u.a[a[z].hid].circ]); fprintf(stderr, "\n"); + for (z = 0; z < a_n; z++) fprintf(stderr, "utg%.6u%c[%u,%u),", (a[z].hid)+1, "lc"[ref->u.a[a[z].hid].circ], a[z].qs, a[z].qe); + fprintf(stderr, "\n"); } } } @@ -21233,136 +21341,1411 @@ ma_ug_t *gen_clean_ug(ma_ug_t *ref, scaf_res_t *cp0, scaf_res_t *cp1) return ug; } -void double_scaffold(ma_ug_t *ref, ma_ug_t *hu0, ma_ug_t *hu1, scaf_res_t *cp0, scaf_res_t *cp1, asg_t *sg, ma_sub_t* cover, ma_hit_t_alloc* src, ma_hit_t_alloc* rev, -long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t, ug_opt_t *opt) +ma_ug_t *gen_shallow_clean_ug(ma_ug_t *ref, bubble_type *bu, scaf_res_t *cp0, scaf_res_t *cp1) { - kvec_asg_arc_t_warp new_rtg_edges; kv_init(new_rtg_edges.a); - ma_ug_t *cref = gen_clean_ug(ref, cp0, cp1); - asg_t *csg = copy_read_graph(sg); - hap_cov_t *cov = NULL; - - print_debug_gfa(sg, cref, cover, "cl.sb.utg", src, ruIndex, opt->max_hang, opt->min_ovlp, 0, 0, 0); + ma_ug_t *ug = copy_untig_graph(ref); + uint32_t i, k, a_n, z, v, w; scaf_res_t *ctg = NULL; ul_vec_t *idx; uc_block_t *a; + if(cp0 || cp1) { + ///just clean bubbles + for (i = 0; i < ug->g->n_seq; i++) { + if(i < bu->f_bub) ug->g->seq[i].del = 1;///no need broken bubble + } + for (i = 0; i < ug->g->n_arc; i++) { + if((bu->index[ug->g->arc[i].ul>>33] < bu->f_bub) || (bu->index[ug->g->arc[i].v>>1] < bu->f_bub)) {///no need broken bubble + ug->g->arc[i].del = 1; + } + } + + ctg = cp0; + if (ctg) { + for(i = 0; i < ctg->n; i++) { + idx = &(ctg->a[i]); + for (k = 0; k < idx->bb.n; k++) { + a = idx->bb.a + idx->bb.a[k].ts; a_n = idx->bb.a[k].te - idx->bb.a[k].ts; + for (z = 0; z < a_n; z++) ug->g->seq[a[z].hid].del = 0; + for (z = 1; z < a_n; z++) { + v = a[z-1].hid<<1; v |= (uint32_t)a[z-1].rev; + w = a[z].hid<<1; w |= (uint32_t)a[z].rev; + if(((v>>1) < bu->f_bub) || ((w>>1) < bu->f_bub)) { + asg_arc_del(ug->g, v, w, 0); + asg_arc_del(ug->g, w^1, v^1, 0); + } + } + } + } + } + + ctg = cp1; + if (ctg) { + for(i = 0; i < ctg->n; i++) { + idx = &(ctg->a[i]); + for (k = 0; k < idx->bb.n; k++) { + a = idx->bb.a + idx->bb.a[k].ts; a_n = idx->bb.a[k].te - idx->bb.a[k].ts; + for (z = 0; z < a_n; z++) ug->g->seq[a[z].hid].del = 0; + for (z = 1; z < a_n; z++) { + v = a[z-1].hid<<1; v |= (uint32_t)a[z-1].rev; + w = a[z].hid<<1; w |= (uint32_t)a[z].rev; + if(((v>>1) < bu->f_bub) || ((w>>1) < bu->f_bub)) { + asg_arc_del(ug->g, v, w, 0); + asg_arc_del(ug->g, w^1, v^1, 0); + } + } + } + } + } + } - new_rtg_edges.a.n = 0; - ///asm_opt.purge_overlap_len = asm_opt.purge_overlap_len_hic; - ///asm_opt.purge_simi_thres = asm_opt.purge_simi_rate_hic; - adjust_utg_by_primary(&cref, csg, TRIO_THRES, src, rev, cover, - tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, - max_hang, min_ovlp, &new_rtg_edges, &cov, b_mask_t, 1, 0); - - ma_ug_destroy(cref); - asg_destroy(csg); - // clean_u_trans_t_idx(&(cov->t_ch->k_trans), ug, sg); + for (i = 0; i < ug->g->n_seq; i++) { + if(ug->g->seq[i].del == 1) { + asg_seq_del(ug->g, i); + if(ug->u.a[i].m!=0) { + ug->u.a[i].m = ug->u.a[i].n = 0; + free(ug->u.a[i].a); + ug->u.a[i].a = NULL; + } + } + } - filter_u_trans(&(cov->t_ch->k_trans), asm_opt.is_bub_trans, asm_opt.is_topo_trans, asm_opt.is_read_trans, asm_opt.is_base_trans); - clean_u_trans_t_idx_filter_adv(&(cov->t_ch->k_trans), ref, sg); - // dbg_prt_utg_trans(&(cov->t_ch->k_trans), ref, "after"); + asg_cleanup(ug->g); + + return ug; +} + +static void worker_for_ctg_trans_cc(void *data, long i, int tid) // callback for kt_for() +{ + kv_u_trans_t *ta = (kv_u_trans_t*)data; + u_trans_t *a = NULL, *r_a = NULL, *mz, *cz; + uint32_t n, r_n, id = i, k, l, z; + + a = u_trans_a(*ta, id); n = u_trans_n(*ta, id); + for (k = 1, l = 0; k <= n; k++) { + if(k == n || a[l].tn != a[k].tn) { + if(k > l) { + get_u_trans_spec(ta, a[l].tn, a[l].qn, &r_a, &r_n); + assert(r_n > 0); + if(id > a[l].tn) { + l = k; continue; + } + + mz = cz = NULL; + for (z = l; z < k; z++) { + cz = &(a[z]); cz->del = 1; + if((mz) && (mz->f == 1) && (cz->f != 1)) continue; + if((!mz) || ((cz->f == 1) && (mz->f != 1)) || ((cz->f == mz->f) && ((mz->nw < cz->nw) || ((mz->nw == cz->nw) && ((mz->qe - mz->qs) < (cz->qe - cz->qs)))))) { + mz = cz; + } + } + for (z = 0; z < r_n; z++) { + cz = &(r_a[z]); cz->del = 1; + if((mz) && (mz->f == 1) && (cz->f != 1)) continue; + if((!mz) || ((cz->f == 1) && (mz->f != 1)) || ((cz->f == mz->f) && ((mz->nw < cz->nw) || ((mz->nw == cz->nw) && ((mz->qe - mz->qs) < (cz->qe - cz->qs)))))) { + mz = cz; + } + } - /**kv_u_trans_t *os =**/ gen_contig_trans(opt, sg, hu1, cp1, hu0, cp0, ref, &(cov->t_ch->k_trans)); + assert(mz); + mz->del = 0; + } + l = k; + } + } +} +void update_ctg_trans_cc(kv_u_trans_t *os) +{ + uint64_t k, m; u_trans_t *z; + kt_for(asm_opt.thread_num, worker_for_ctg_trans_cc, os, os->idx.n); + for (k = m = 0; k < os->n; k++) { + if(!(os->a[k].del)) os->a[m++] = os->a[k]; + } + os->n = m; + for (k = 0; k < m; k++) { + kv_pushp(u_trans_t, *os, &z); + *z = os->a[k]; + z->qn = os->a[k].tn; z->qs = os->a[k].ts; z->qe = os->a[k].te; + z->tn = os->a[k].qn; z->ts = os->a[k].qs; z->te = os->a[k].qe; + } + kt_u_trans_t_idx(os, os->idx.n); +} +uint32_t get_scaf_res_t(scaf_res_t *pa, uint32_t uid, uint32_t s, uint32_t e, uint32_t *in, uint32_t *rs, uint32_t *re, uint32_t *rn, uc_block_t **ra) +{ + // fprintf(stderr, "s::%u, e::%u, uid::%u, pa->n::%u\n", s, e, uid, (uint32_t)pa->n); + if(uid >= pa->n) return 0; + ul_vec_t *idx = &(pa->a[uid]); uint32_t k, z, a_n, os, oe, z0; uc_block_t *a; + (*rs) = (*re) = (uint32_t)-1; (*rn) = 0; (*ra) = NULL; + for (k = (*in); k < idx->bb.n; k++) { + a = idx->bb.a + idx->bb.a[k].ts; a_n = idx->bb.a[k].te - idx->bb.a[k].ts; + os = MAX(s, idx->bb.a[k].qs); + oe = MIN(e, idx->bb.a[k].qe); + // fprintf(stderr, "k::%u, os::%u, oe::%u, a_n::%u\n", k, os, oe, a_n); + if(os >= oe) continue; + for(z = 0; z < a_n && a[z].qe <= os; z++); + if(z >= a_n) continue; + for(z0 = z; z < a_n && a[z].qs < oe; z++); + if(z <= z0 ) continue; + *rs = os; *re = oe; *rn = z - z0; *ra = a + z0; (*in) = k + 1; + return 1; + } + return 0; } - -void gen_self_scaf(ug_opt_t *opt, ma_ug_t *hu0, ma_ug_t *hu1, asg_t *sg, ma_sub_t *cov, ma_hit_t_alloc *src, ma_hit_t_alloc *rev, R_to_U *ri, -long long tipsLen, float tip_drop_ratio, long long stops_threshold, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t) +void prt_scaf_res_t(ma_ug_t *ref, scaf_res_t *cp, ma_ug_t *ug, uint32_t uid, uint32_t s, uint32_t e, const char *pfx, uint32_t bw) { - /** - dedup_idx_t *hidx0 = NULL, *hidx1 = NULL, *uidx = NULL; uint8_t *ff = NULL; if(hu0 || hu1) CALLOC(ff, sg->n_seq); - ma_ug_t *ug = NULL; uint64_t pscut = 0; kvect_N_t Ns; kv_init(Ns); - // ug = ma_ug_gen(sg); - pscut = (asm_opt.hom_global_coverage_set?(asm_opt.hom_global_coverage):(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE))); - pscut *= PHASE_SEF; if(pscut < PHASE_SEP) pscut = PHASE_SEP; - ug = ma_ug_gen_phase(sg, pscut, PHASE_SEP_RATE); + uint32_t in, rs, re, rn, z; uc_block_t *ra; + in = 0; + while (get_scaf_res_t(cp, uid, s, e, &in, &rs, &re, &rn, &ra)) { + fprintf(stderr, "%s%.6u%c\t%u\t%u\t%u\tn(%u)\n", pfx, uid+1, "lc"[ug->u.a[uid].circ], ug->u.a[uid].len, rs, re, rn); + if(rn <= (bw<<1)) { + for (z = 0; z < rn; z++) fprintf(stderr, "utg%.6u%c,", (ra[z].hid)+1, "lc"[ref->u.a[ra[z].hid].circ]); + } else { + for (z = 0; z < bw; z++) fprintf(stderr, "utg%.6u%c,", (ra[z].hid)+1, "lc"[ref->u.a[ra[z].hid].circ]); + fprintf(stderr, "......,"); + for (z = rn-bw; z < rn; z++) fprintf(stderr, "utg%.6u%c,", (ra[z].hid)+1, "lc"[ref->u.a[ra[z].hid].circ]); + } + fprintf(stderr, "\n"); + } +} - uidx = gen_dedup_idx_t(ug, sg); if(hu0) hidx0 = gen_dedup_idx_t(hu0, sg); if(hu1) hidx1 = gen_dedup_idx_t(hu1, sg); - update_recover_atg_cov(); +void print_ctg_trans_ovlp(kv_u_trans_t *os, ma_ug_t *ref, ma_ug_t *hu1, scaf_res_t *cp1, ma_ug_t *hu2, scaf_res_t *cp2) +{ + u_trans_t *p = NULL; uint32_t i, qui, tui, qid, tid; ma_utg_t *uq, *ut; + for (i = 0; i < os->n; i++) { + p = &(os->a[i]); + qui = p->qnu.n?1:2; qid = ((p->qnu.n)?p->qn:p->qn-hu1->u.n); + tui = p->tnu.n?1:2; tid = ((p->tnu.n)?p->tn:p->tn-hu1->u.n); + uq = ((p->qnu.n)?&(hu1->u.a[p->qn]):&(hu2->u.a[p->qn-hu1->u.n])); + ut = ((p->tnu.n)?&(hu1->u.a[p->tn]):&(hu2->u.a[p->tn-hu1->u.n])); + fprintf(stderr, "************\n"); + fprintf(stderr, "[M::%s] h%utg%.6u%c\t%u\t%u\t%u\t%c\th%utg%.6u%c\t%u\t%u\t%u\tw(%f)\tf(%u)\n", __func__, + qui, qid+1, "lc"[uq->circ], uq->len, p->qs, p->qe, "+-"[p->rev], + tui, tid+1, "lc"[ut->circ], ut->len, p->ts, p->te, p->nw, p->f); - if(hidx0 && haploid_self_scaf(hidx0, uidx)); - if(hidx1); - if(hidx0 && hidx1); + prt_scaf_res_t(ref, qui==1?cp1:cp2, qui==1?hu1:hu2, (p->qnu.n)?p->qn:p->qn-hu1->u.n, p->qs, p->qe, qui==1?"h1tg":"h2tg", 3); + prt_scaf_res_t(ref, tui==1?cp1:cp2, tui==1?hu1:hu2, (p->tnu.n)?p->tn:p->tn-hu1->u.n, p->ts, p->te, tui==1?"h1tg":"h2tg", 3); + } +} +static inline void scaf_hit2arc(u_trans_t *a0, u_trans_t *a1, uint64_t rev, uint64_t el, int64_t glen, asg_arc_t *p) +{ + uint64_t v, w; + memset(p, 0, sizeof((*p))); + v = ((uint64_t)(a0->tn<<1))|((uint64_t)(!!(a0->rev))); v^=((uint64_t)(!!(rev))); + w = ((uint64_t)(a1->tn<<1))|((uint64_t)(!!(a1->rev))); w^=((uint64_t)(!!(rev))); + // p->ul = v; p->ul <<= 32; p->ul |= a0i; p->v = w; p->ol = a1i; p->el = el; + p->ul = v; p->ul <<= 32; p->v = w; p->ol = ((glen>=0)?(glen):(-glen)); p->el = el; p->strong = ((glen>=0)?(1):(0)); + // a0r = 0; if((a0->f) || (((a0->qe-a0->qs) >= reliable_len) && ((a0->te-a0->ts) >= reliable_len))) a0r = 1; + // a1r = 0; if((a1->f) || (((a1->qe-a1->qs) >= reliable_len) && ((a1->te-a1->ts) >= reliable_len))) a1r = 1; + // p->el = ((a0r && a1r)?1:0); +} - if(hidx0) destroy_dedup_idx_t(hidx0); if(hidx1) destroy_dedup_idx_t(hidx1); if(uidx) destroy_dedup_idx_t(uidx); - free(ff); ma_ug_destroy(ug); kv_destroy(Ns); - **/ - ma_ug_t *ug = NULL; - //uint64_t pscut = 0; - //pscut = (asm_opt.hom_global_coverage_set?(asm_opt.hom_global_coverage):(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE))); - //pscut *= PHASE_SEF; if(pscut < PHASE_SEP) pscut = PHASE_SEP; - //ug = ma_ug_gen_phase(sg, pscut, PHASE_SEP_RATE); - ug = ma_ug_gen(sg); +uint64_t is_noisy_ov(uint64_t s, uint64_t e, uint64_t len, uint64_t bd, double cov_rate, u_trans_t *a, uint64_t a_n) +{ + uint64_t w, k, zs, ze, oz, tot; + if(e - s < (bd<<1)) { + w = (e + s)>>1; + s = ((w>=bd)?(w-bd):(0)); + e = ((w+bd<=len)?(w+bd):(len)); + } + if(s >= e) return 0; + for(k = tot = 0; k < a_n; k++) { + if(a[k].qs >= e) break; + if(a[k].qe <= s) continue; + zs = MAX(s, a[k].qs); ze = MIN(e, a[k].qe); + oz = ((ze > zs)? (ze - zs):(0)); + tot += oz; + } + if((tot > 0) && (tot >= ((e - s)*cov_rate))) return 1; + return 0; +} - print_debug_gfa(sg, ug, cov, "sb.utg", src, ri, opt->max_hang, opt->min_ovlp, 0, 0, 0); - - fprintf(stderr, "[M::%s] hu0\n", __func__); - scaf_res_t *cp0 = gen_contig_path(opt, sg, hu0, ug); prt_scaf_res_t(cp0, ug, hu0); - fprintf(stderr, "[M::%s] hu1\n", __func__); - scaf_res_t *cp1 = gen_contig_path(opt, sg, hu1, ug); prt_scaf_res_t(cp1, ug, hu1); +uint64_t get_ref_nid(u_trans_t *z, ma_ug_t *hu1, ma_ug_t *hu2, scaf_res_t *cp1, scaf_res_t *cp2, uint64_t is_beg, uint64_t gap_bd) +{ + scaf_res_t *sc = NULL; uint64_t id, s_end, e_end; + if(z->tn < hu1->u.n) { + sc = cp1; id = z->tn; + } else { + sc = cp2; id = z->tn - hu1->u.n; + } + + if(is_beg) { + s_end = 0; e_end = 1; + } else { + s_end = 1; e_end = 0; + } - double_scaffold(ug, hu0, hu1, cp0, cp1, sg, cov, src, rev, tipsLen, tip_drop_ratio, stops_threshold, ri, chimeric_rate, drop_ratio, max_hang, min_ovlp, b_mask_t, opt); + if(!(z->rev)) { + s_end ^= 1; e_end ^= 1; + } - ma_ug_destroy(ug); destroy_scaf_res_t(cp0); destroy_scaf_res_t(cp1); + ul_vec_t *idx = &(sc->a[id]); uint64_t k, m, a_n, os, oe, m0; + uc_block_t *a, *am = NULL; uint64_t max_len = 0, cgap, clen; + + for (k = 0; k < idx->bb.n; k++) { + a = idx->bb.a + idx->bb.a[k].ts; a_n = idx->bb.a[k].te - idx->bb.a[k].ts; + os = MAX(z->ts, idx->bb.a[k].qs); + oe = MIN(z->te, idx->bb.a[k].qe); + if(os >= oe) continue; + for(m = 0; m < a_n && a[m].qe <= os; z++); + if(m >= a_n) continue; + for(m0 = m; m < a_n && a[m].qs < oe; m++); + if(m <= m0) continue; + + os = MAX(z->ts, a[m0].qs); + oe = MIN(z->te, a[m-1].qe); + if(oe <= os) continue; + clen = oe - os; cgap = 0; + if(s_end) { + cgap = os - z->ts; + } else if(e_end) { + cgap = z->te - oe; + } + if(cgap > gap_bd) continue; + if(clen > max_len) { + max_len = clen; am = ((s_end)?(&(a[m0])):(&(a[m-1]))); + } + } + + if(!am) return (uint64_t)-1; + return ((uint64_t)(am->hid<<1))|((uint64_t)(!!(am->rev))); } -void output_trio_graph_joint(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, -ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, -long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, -int min_ovlp, long long gap_fuzz, bub_label_t* b_mask_t, ma_ug_t **rhu0, ma_ug_t **rhu1, ug_opt_t *opt) +uint64_t cal_self_close_tang(asg_t *g, uint32_t s0, uint32_t e0, asg32_v *b1, asg32_v *b2, uint8_t *f, uint32_t max_path) { - ma_ug_t *hu0 = NULL, *hu1 = NULL; kvec_asg_arc_t_warp arcs0, arcs1; - memset(&arcs0, 0, sizeof(arcs0)); memset(&arcs1, 0, sizeof(arcs1)); + uint32_t z, v, nv, k, se = 0, o0, o1, o, tot; asg_arc_t *av; + b1->n = 0; b2->n = 0; tot = 0; + kv_push(uint32_t, *b1, (s0>>1)); + while(b1->n) { + z = b1->a[--b1->n]; + if(f[z]) continue; + f[z] = 1; o0 = o1 = 0; + kv_push(uint32_t, *b2, z); - reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, ruIndex, NULL); - - hu0 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, - reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, - drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &arcs0); - - hu1 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, - reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, - drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &arcs1); + v = (z<<1); o = 0; + if((v != e0) && (v != (s0^1))) { + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if(f[av[k].v>>1]) { + if(((av[k].v>>1) != (s0>>1)) && ((av[k].v>>1) != (e0>>1)) && (o < av[k].ol)) o = av[k].ol; + continue; + } + kv_push(uint32_t, *b1, (av[k].v>>1)); + } + if(se) break; + } + o0 = o; - dedup_idx_t *hidx0 = NULL, *hidx1 = NULL; uint8_t *ff; CALLOC(ff, sg->n_seq); - hidx0 = gen_dedup_idx_t(hu0, sg); hidx1 = gen_dedup_idx_t(hu1, sg); - update_recover_atg_cov(); - uint64_t dedup_base = 0, miss_base = 0, s; - s = dedup_exact_ug(hidx1, hidx0, coverage_cut, sources, ruIndex, ff, FATHER); dedup_base += s; - s = dedup_exact_ug(hidx0, hidx1, coverage_cut, sources, ruIndex, ff, MOTHER); dedup_base += s; - destroy_dedup_idx_t(hidx0); destroy_dedup_idx_t(hidx1); + v = (z<<1) + 1; o = 0; + if((v != e0) && (v != (s0^1))) { + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if(f[av[k].v>>1]) { + if(((av[k].v>>1) != (s0>>1)) && ((av[k].v>>1) != (e0>>1)) && (o < av[k].ol)) o = av[k].ol; + continue; + } + kv_push(uint32_t, *b1, (av[k].v>>1)); + } + if(se) break; + } + o1 = o; - s = append_miss_nid(sg, hu0, hu1, ff, PHASE_MISS_LEN, PHASE_MISS_N); miss_base += s; - // s = append_miss_nid(sg, hu0, hu1, ff, PHASE_MISS_SLEN, PHASE_MISS_SN); miss_base += s; - free(ff); + if(o0 + o1 < g->seq[z].len) tot += g->seq[z].len - o0 - o1; + if(tot > max_path) break; + } - renew_utg((&hu0), sg, &arcs0); renew_utg((&hu1), sg, &arcs1); - fprintf(stderr, "[M::%s] dedup_base::%lu, miss_base::%lu\n", __func__, dedup_base, miss_base); + for (k = 0; k < b2->n; k++) f[b2->a[k]] = 0; + + if((!se) && (tot <= max_path)) return tot; + return (uint64_t)-1; +} - if(asm_opt.self_scaf) { - gen_self_scaf(opt, hu0, hu1, sg, coverage_cut, sources, reverse_sources, ruIndex, tipsLen, tip_drop_ratio, stops_threshold, chimeric_rate, drop_ratio, max_hang, min_ovlp, b_mask_t); +uint64_t get_ref_nid_flank(u_trans_t *z, ma_ug_t *ref, ma_ug_t *hu1, ma_ug_t *hu2, scaf_res_t *cp1, scaf_res_t *cp2, uint64_t is_beg, uint64_t gap_bd, uint64_t flank_len, asg32_v *res) +{ + scaf_res_t *sc = NULL; uint64_t id, s_end, e_end, rn = res->n; + if(z->tn < hu1->u.n) { + sc = cp1; id = z->tn; + } else { + sc = cp2; id = z->tn - hu1->u.n; } - if(!rhu0) { - output_hap_graph(hu0, sg, &arcs0, coverage_cut, output_file_name, FATHER, sources, ruIndex, max_hang, min_ovlp, NULL); - ma_ug_destroy(hu0); + if(is_beg) { + s_end = 0; e_end = 1; } else { - (*rhu0) = hu0; - } - kv_destroy(arcs0.a); + s_end = 1; e_end = 0; + } + + if(z->rev) { + s_end ^= 1; e_end ^= 1; + } + + ul_vec_t *idx = &(sc->a[id]); uint64_t k, m, a_n, os, oe, m0; + uc_block_t *a, *am = NULL; uint64_t max_len = 0, cgap, clen; int64_t ami, amn = 0, as, ae, amz; + // if(z->tn == 5) { + // fprintf(stderr, "\n[M::%s] h1tg%.6u%c(%c)\ts::%u\te::%u\ts_end::%lu\te_end::%lu\n", __func__, z->tn+1, "lc"[hu1->u.a[z->tn].circ], "+-"[z->rev], z->ts, z->te, s_end, e_end); + // } - if(!rhu1) { - output_hap_graph(hu1, sg, &arcs1, coverage_cut, output_file_name, MOTHER, sources, ruIndex, max_hang, min_ovlp, NULL); - ma_ug_destroy(hu1); - } else { - ma_ug_destroy(hu1); + for (k = 0; k < idx->bb.n; k++) { + a = idx->bb.a + idx->bb.a[k].ts; a_n = idx->bb.a[k].te - idx->bb.a[k].ts; + os = MAX(z->ts, idx->bb.a[k].qs); + oe = MIN(z->te, idx->bb.a[k].qe); + // if(z->tn == 5) { + // fprintf(stderr, "[M::%s] idx->bb.a[%lu]::[%u, %u), o[%lu, %lu), a_n::%lu\n", __func__, k, idx->bb.a[k].qs, idx->bb.a[k].qe, os, oe, a_n); + // } + if(os >= oe) continue; + for(m = 0; m < a_n && a[m].qe <= os; m++); + // { + // if(z->tn == 5) fprintf(stderr, "+(%lu)utg%.6u%c[%u,%u),", m, (a[m].hid)+1, "lc"[ref->u.a[a[m].hid].circ], a[m].qs, a[m].qe); + // } + // if(z->tn == 5) fprintf(stderr, "\n[M::%s] +m::%lu\n", __func__, m); + if(m >= a_n) continue; + for(m0 = m; m < a_n && a[m].qs < oe; m++); + // { + // if(z->tn == 5) fprintf(stderr, "-(%lu)utg%.6u%c[%u,%u),", m, (a[m].hid)+1, "lc"[ref->u.a[a[m].hid].circ], a[m].qs, a[m].qe); + // } + // if(z->tn == 5) fprintf(stderr, "\n[M::%s] -m::%lu\n", __func__, m); + if(m <= m0) continue; + + os = MAX(z->ts, a[m0].qs); + oe = MIN(z->te, a[m-1].qe); + // if(z->tn == 5) { + // fprintf(stderr, "[M::%s] m0::%lu, m::%lu, o[%lu, %lu), a_n::%lu\n", __func__, m0, m, os, oe, a_n); + // } + if(oe <= os) continue; + clen = oe - os; cgap = 0; + if(s_end) { + cgap = os - z->ts; + } else if(e_end) { + cgap = z->te - oe; + } + if(cgap > gap_bd) continue; + if(clen > max_len) { + max_len = clen; am = a + m0; amn = m - m0; + } + } + + // if(z->tn == 49 && z->ts == 0 && z->te == 2143149) { + // fprintf(stderr, "-0-[M::%s] h1tg%.6ul(%c)\tts::%u\tte::%u\tamn::%ld\ts_end::%lu\te_end::%lu\n", __func__, + // z->tn+1, "+-"[z->rev], z->ts, z->te, amn, s_end, e_end); + // } + + // if(z->tn == 5) fprintf(stderr, "[M::%s] amn::%ld\n", __func__, amn); + + if((!am) || (!amn)) return 0; + os = oe = 0; as = 0; ae = amn; + if(s_end) { + for(ami = 0; (ami < amn) && ((z->ts + flank_len) >= am[ami].qe); ami++); + if((ami < amn) && ((z->ts + flank_len) >= am[ami].qs)) ami++; + as = 0; ae = ami; + } else if(e_end) { + for(ami = amn - 1; (ami >= 0) && (z->te <= (am[ami].qs + flank_len)); ami--); + if((ami >= 0) && (z->te <= (am[ami].qe + flank_len))) ami--; + ami++; as = ami; ae = amn; + } + // if(z->tn == 49 && z->ts == 0 && z->te == 2143149) { + // fprintf(stderr, "-1-[M::%s] h1tg%.6ul(%c)\tts::%u\tte::%u\tas::%ld\tae::%ld\n", __func__, + // z->tn+1, "+-"[z->rev], z->ts, z->te, as, ae); + // } + // if(z->tn == 5) fprintf(stderr, "+[M::%s] as::%ld, ae::%ld\n", __func__, as, ae); + if(as >= ae) return 0; + + ///make sure there are no identical node + if(s_end) { + for (ami = as; ami < ae; ami++) { + for (amz = ami + 1; (amz < ae) && (am[ami].hid != am[amz].hid); amz++); + if(amz < ae) { + ae = ami + 1; break; + } + } + } else if(e_end) { + for (ami = ae - 1; ami >= as; ami--) { + for (amz = ami - 1; (amz >= as) && (am[ami].hid != am[amz].hid); amz--); + if(amz >= as) { + as = ami; break; + } + } + } + // if(z->tn == 49 && z->ts == 0 && z->te == 2143149) { + // fprintf(stderr, "-1-[M::%s] h1tg%.6ul(%c)\tts::%u\tte::%u\tas::%ld\tae::%ld\n", __func__, + // z->tn+1, "+-"[z->rev], z->ts, z->te, as, ae); + // } + // if(z->tn == 5) fprintf(stderr, "+[M::%s] as::%ld, ae::%ld\n", __func__, as, ae); + if(as >= ae) return 0; + + uint32_t *fp = NULL, r2n = 0; + kv_resize(uint32_t, *res, rn + ((ae - as)<<1)); + for (ami = as; ami < ae; ami++) { + m = ((uint64_t)(am[ami].hid<<1))|((uint64_t)(!!(am[ami].rev))); m ^= ((uint64_t)(!!(z->rev))); + kv_push(uint32_t, *res, m); + } + rn = res->n - rn; + if(z->rev) { + fp = res->a + res->n - rn; r2n = rn>>1; + for (k = 0; k < r2n; k++) { + m = fp[k]; fp[k] = fp[rn - k -1]; fp[rn - k -1] = m; + } + } + + if(s_end) { + for (ami = as; ami < ae; ami++) { + m = ((am[ami].qs>=z->ts)?(am[ami].qs-z->ts):(0)); + kv_push(uint32_t, *res, m); + } + } else if(e_end) { + for (ami = as; ami < ae; ami++) { + m = ((z->te>=am[ami].qe)?(z->te-am[ami].qe):(0)); + kv_push(uint32_t, *res, m); + } + } + if(z->rev) { + fp = res->a + res->n - rn; r2n = rn>>1; + for (k = 0; k < r2n; k++) { + m = fp[k]; fp[k] = fp[rn - k - 1]; fp[rn - k - 1] = m; + } + } + + return rn; + // return ((uint64_t)(am->hid<<1))|((uint64_t)(!!(am->rev))); +} + +void dedup_tang_flank(uint32_t *va, uint32_t *vd, uint32_t *vn0, uint32_t *wa, uint32_t *wd, uint32_t *wn0) +{ + uint64_t k, z, vn = (*vn0), wn = (*wn0); + uint32_t vp, wp; + for (k = 0; k < vn && k < wn; k++) { + if(k < vn) { + vp = va[vn-k-1]>>1; + for (z = k; (z < wn) && (vp != (wa[z]>>1)); z++); + if(z < wn) wn = z; + } + if(k < wn) { + wp = wa[k]>>1; + for (z = k; (z < vn) && (wp != (va[vn-z-1]>>1)); z++); + if(z < vn) vn = z; + } + } + if(vn < (*vn0)) { + z = (*vn0) - vn; + for (k = 0; k < vn; k++) { + va[k] = va[k + z]; vd[k] = vd[k + z]; + } + } + (*vn0) = vn; (*wn0) = wn; +} + + +uint64_t cal_self_close_tang_flank0(asg_t *g, uint32_t v0, uint32_t s0, uint32_t e0, uint64_t sf, uint64_t ef, asg32_v *b1, asg32_v *b2, uint8_t *f, uint8_t *fs, uint64_t max_path) +{ + uint64_t se = 0, k, v, o, o0, o1, nv, tot = 0; asg_arc_t *av; + + if(f[v0]) return (uint64_t)-1; + + // if((v0>>1) == 47687) fprintf(stderr, "+[M::%s] v0->utg%.6ul(%c), s0::%u, e0::%u, sf::%lu, ef::%lu\n", __func__, (v0>>1)+1, "+-"[v0&1], s0, e0, sf, ef); + + b1->n = b2->n = 0; kv_push(uint32_t, *b1, v0); + while(b1->n) { + v = b1->a[--b1->n]; + if(f[v] && f[v^1]) continue; + // if((v0>>1) == 47687) fprintf(stderr, "[M::%s] utg%.6lul(%c), tot::%lu\n", __func__, (v>>1)+1, "+-"[v&1], tot); + + v = v; + // if((v0>>1) == 47687) fprintf(stderr, "[M::%s] v::%lu, f[v]::%u, fs[v]::%u, fs[v^1]::%u\n", __func__, v, f[v], fs[v], fs[v^1]); + if((!f[v]) && (v!=(s0^1)) && (v!=e0)) { + if((fs[v^1] != sf) && (fs[v] != ef)) { + f[v] = 1; kv_push(uint32_t, *b2, v); + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = 0; k < nv; k++) { + if(av[k].del) continue; + // if((v0>>1) == 47687) fprintf(stderr, "[M::%s] k::%lu, f[av[k].v]::%u, fs[av[k].v]::%u, fs[av[k].v^1]::%u\n", __func__, k, f[av[k].v], fs[av[k].v], fs[av[k].v^1]); + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if(fs[av[k].v] == sf) { + fs[av[k].v] = 0;///not a beg node anymore + if(!f[av[k].v^1]) kv_push(uint32_t, *b1, (av[k].v^1));///push reverse node + } + if(fs[(av[k].v^1)] == ef) { + fs[(av[k].v^1)] = 0;///not a end node anymore; do not need to push as it will be pushed later + if(!f[av[k].v]) kv_push(uint32_t, *b1, (av[k].v));///push reverse node + } + + /**if((fs[av[k].v^1] != sf) && (fs[av[k].v] != ef))**/ { + if((!f[av[k].v])) kv_push(uint32_t, *b1, av[k].v); + } + } + if(se) break; + } + } + + v = v^1; + // if((v0>>1) == 47687) fprintf(stderr, "[M::%s] v::%lu, f[v]::%u, fs[v]::%u, fs[v^1]::%u\n", __func__, v, f[v], fs[v], fs[v^1]); + if((!f[v]) && (v != (s0^1)) && (v != e0)) { + if((fs[v^1] != sf) && (fs[v] != ef)) { + f[v] = 1; kv_push(uint32_t, *b2, v); + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = 0; k < nv; k++) { + if(av[k].del) continue; + // if((v0>>1) == 47687) fprintf(stderr, "[M::%s] k::%lu, f[av[k].v]::%u, fs[av[k].v]::%u, fs[av[k].v^1]::%u\n", __func__, k, f[av[k].v], fs[av[k].v], fs[av[k].v^1]); + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if(fs[av[k].v] == sf) { + fs[av[k].v] = 0;///not a beg node anymore + if(!f[av[k].v^1]) kv_push(uint32_t, *b1, (av[k].v^1));///push reverse node + } + if(fs[(av[k].v^1)] == ef) fs[(av[k].v^1)] = 0;///not a end node anymore; do not need to push as it will be pushed later + + /**if((fs[av[k].v^1] != sf) && (fs[av[k].v] != ef))**/ { + if((!f[av[k].v])) kv_push(uint32_t, *b1, av[k].v); + } + } + if(se) break; + } + } + + if(f[v] && f[v^1]) { + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((f[av[k].v^1]) && (f[av[k].v]) && (o < av[k].ol)) o = av[k].ol; + } + o0 = o; + + av = asg_arc_a(g, (v^1)); nv = asg_arc_n(g, (v^1)); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((f[av[k].v^1]) && (f[av[k].v]) && (o < av[k].ol)) o = av[k].ol; + } + o1 = o; + + if(o0 + o1 < g->seq[v>>1].len) tot += g->seq[v>>1].len - o0 - o1; + } + + if(tot > max_path) break; + } + + // if((v0>>1) == 47687) fprintf(stderr, "-[M::%s] v0->utg%.6ul(%c), tot::%lu, se::%lu\n", __func__, (v0>>1)+1, "+-"[v0&1], tot, se); + + if((!se) && (tot <= max_path)) return tot; + return (uint64_t)-1; +} + + + +uint64_t cal_self_close_tang_flank1(asg_t *g, uint32_t s0, uint32_t e0, asg32_v *b1, asg32_v *b2, uint8_t *f, uint8_t *fsk, uint64_t max_path) +{ + uint64_t se = 0, k, v, o, o0, o1, nv, tot = 0; asg_arc_t *av; + + b1->n = b2->n = 0; kv_push(uint32_t, *b1, s0); + while(b1->n) { + v = b1->a[--b1->n]; + if(f[v] && f[v^1]) continue; + + v = v; + if((!f[v]) && (v!=(s0^1)) && (v!=e0)) { + f[v] = 1; kv_push(uint32_t, *b2, v); + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = 0; k < nv; k++) { + if(av[k].del) continue; + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if((!f[av[k].v])) kv_push(uint32_t, *b1, av[k].v); + } + if(se) break; + } + + v = v^1; + if((!f[v]) && (v!=(s0^1)) && (v!=e0)) { + f[v] = 1; kv_push(uint32_t, *b2, v); + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = 0; k < nv; k++) { + if(av[k].del) continue; + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if((!f[av[k].v])) kv_push(uint32_t, *b1, av[k].v); + } + if(se) break; + } + + if(f[v] && f[v^1] && (!fsk[v>>1])) { + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((f[av[k].v^1]) && (f[av[k].v]) && (!fsk[av[k].v>>1]) && (o < av[k].ol)) o = av[k].ol; + } + o0 = o; + + av = asg_arc_a(g, (v^1)); nv = asg_arc_n(g, (v^1)); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((f[av[k].v^1]) && (f[av[k].v]) && (!fsk[av[k].v>>1]) && (o < av[k].ol)) o = av[k].ol; + } + o1 = o; + + if(o0 + o1 < g->seq[v>>1].len) tot += g->seq[v>>1].len - o0 - o1; + } + + if(tot > max_path) break; + } + + if((!se) && (tot <= max_path)) return tot; + return (uint64_t)-1; +} + + +/** +uint64_t cal_self_close_tang_flank0_back(asg_t *g, uint32_t v0, uint32_t s0, uint32_t e0, uint64_t sf, uint64_t ef, asg32_v *b1, asg32_v *b2, uint8_t *f, uint8_t *fs, uint64_t max_path, uint64_t tot) +{ + if(f[v0]) return tot; + uint64_t se = 0, k, v, o, o0, o1, nv; asg_arc_t *av; + + b1->n = 0; kv_push(uint32_t, *b1, v0); + while(b1->n) { + v = b1->a[--b1->n]; + if(f[v] && f[v^1]) continue; + + o0 = o1 = 0; + + o = 0; + if((v != e0) && (v != (s0^1)) && (!f[v])) { + if((fs[v] != ef) && (fs[v^1] != sf)) { + f[v] = 1; kv_push(uint32_t, *b2, v); + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if((fs[av[k].v] != ef) && (fs[av[k].v^1] != sf)) { + if(f[av[k].v^1]) { + if((!fs[av[k].v]) && (!fs[av[k].v^1]) && (o < av[k].ol)) o = av[k].ol; + continue; + } + kv_push(uint32_t, *b1, av[k].v); + } + } + if(se) break; + } + } + o0 = o; + + v = v^1; + o = 0; + if((v != e0) && (v != (s0^1)) && (!f[v])) { + if((fs[v] != ef) && (fs[v^1] != sf)) { + f[v] = 1; kv_push(uint32_t, *b2, v); + av = asg_arc_a(g, v); nv = asg_arc_n(g, v); + for (k = o = 0; k < nv; k++) { + if(av[k].del) continue; + if((av[k].v == s0) || (av[k].v == (e0^1))) { + se = 1; break; + } + if((fs[av[k].v] != ef) && (fs[av[k].v^1] != sf)) { + if(f[av[k].v^1]) { + if((!fs[av[k].v]) && (!fs[av[k].v^1]) && (o < av[k].ol)) o = av[k].ol; + continue; + } + kv_push(uint32_t, *b1, av[k].v); + } + } + if(se) break; + } + } + o1 = o; + + if((!fs[v]) && (!fs[v^1]) && f[v] && f[v^1] && (o0 + o1 < g->seq[z].len)) { + tot += g->seq[z].len - o0 - o1; + } + if(tot > max_path) break; + } + + if((!se) && (tot <= max_path)) return tot; + return (uint64_t)-1; +} + +uint64_t cal_self_close_tang_flank1(asg_t *g, uint32_t *s, uint32_t sn, uint32_t *e, uint32_t en, uint32_t s0, uint32_t e0, uint64_t sf, uint64_t ef, asg32_v *b1, asg32_v *b2, uint8_t *f, uint8_t *fs, uint64_t max_path, uint64_t tot) +{ + uint64_t k, sk, ek; + for (k = 0, sk = (uint64_t)-1; (k < sn) && (!f[s[k]]) && (!f[s[k]^1]); k++); if(k < sn) sk = k; + for (k = 0, ek = (uint64_t)-1; (k < en) && (!f[e[en-k-1]]) && (!f[e[en-k-1]^1]); k++); if(k < en) ek = en-k-1; + if(sk != (uint64_t)-1) { + for (k = sk + 1; k < sn; k++) fs[s[k]] = 0; + } + if(ek != (uint64_t)-1) { + for (k = ek + 1; k < en; k++) fs[e[en-k-1]] = 0; + } + + if(sk != (uint64_t)-1) { + for (k = sk + 1; k < sn; k++) { + if((!fs[s[k]])) { + if(f[s[k]]) cal_self_close_tang_flank0(g, s[k], s0, e0, sf, ef, b1, b2, f, fs, max_path, tot); + + } + } + } + +} +**/ + + +uint64_t trace_back_tangle_dis(asg_t *g, uint32_t *s, uint32_t *sd, uint32_t sn, uint32_t *e, uint32_t *ed, uint32_t en, asg32_v *b, asg32_v *b1, asg32_v *b2, uint8_t *f, uint8_t *fs, uint8_t *fsk, uint64_t sf, uint64_t ef, uint64_t max_path, uint64_t *is_dip) +{ + uint64_t k, sk, sm, ek, em, s0, slen, e0, elen, tot; b->n = 0; (*is_dip) = 0; + for (sk = sm = 0; sk < sn; sk++) { + if((fs[s[sn-sk-1]] == sf) && (f[s[sn-sk-1]]) && (!f[s[sn-sk-1]^1])) {///a potiential + kv_push(uint32_t, *b, sn-sk-1); sm++; + } + } + if(!sm) return ((uint64_t)-1); + + for (ek = em = 0; ek < en; ek++) { + if((fs[e[ek]] == ef) && (!f[e[ek]]) && (f[e[ek]^1])) {///a potiential + kv_push(uint32_t, *b, ek); em++; + } + } + if(!em) return ((uint64_t)-1); + + uint32_t *si = b->a, *ei = b->a + sm; + for (sk = 0; sk < sm; sk++) { + s0 = s[si[sk]]; slen = sd[si[sk]]; + for (ek = 0; ek < em; ek++) { + e0 = e[ei[ek]]; elen = ed[ei[ek]]; + for (k = 0; k < b2->n; k++) f[b2->a[k]] = 0; + + tot = cal_self_close_tang_flank1(g, s0, e0, b1, b2, f, fsk, max_path + slen + elen); + if((tot != ((uint64_t)-1)) && (tot <= (max_path + slen + elen))) { + if(fsk[s0>>1] && fsk[e0>>1]) { + (*is_dip) = 1; + return 0; + } else { + return ((tot>=(slen + elen))?(tot-(slen + elen)):(0)); + } + } + } + } + return ((uint64_t)-1); +} + +uint64_t cal_self_close_tang_flank(asg_t *g, uint32_t *s, uint32_t *sd, uint32_t sn, uint32_t *e, uint32_t *ed, uint32_t en, asg32_v *b1, asg32_v *b2, asg32_v *b3, uint8_t *f, uint8_t *fs, uint8_t *fsk, uint64_t max_path, uint64_t *is_dip) +{ + (*is_dip) = 0; + if((!sn) || (!en)) return (uint64_t)-1; + + uint64_t z, k, tot, sf = 1, ef = 2, s0, e0; + for (k = 0; k < sn; k++) { + assert(!(fs[s[k]])); fs[s[k]] = sf; + } + s0 = s[0]; + + for (k = 0; k < en; k++) { + assert(!(fs[e[k]])); fs[e[k]] = ef; + } + e0 = e[en-1]; + + b1->n = 0; b2->n = 0; tot = ((uint64_t)-1); + for (k = 0; k < sn; k++) { + if(fs[s[sn-k-1]] != sf) continue; + tot = cal_self_close_tang_flank0(g, s[sn-k-1], s0, e0, sf, ef, b1, b2, f, fs, max_path); + if((tot == ((uint64_t)-1)) || (tot > max_path)) { + tot = ((uint64_t)-1); break; + } else { + tot = trace_back_tangle_dis(g, s, sd, sn, e, ed, en, b3, b1, b2, f, fs, fsk, sf, ef, max_path, is_dip); + if(tot != ((uint64_t)-1)) break; + } + + ///reset s && e + for (z = 0; z < sn; z++) fs[s[z]] = sf; + for (z = 0; z < en; z++) fs[e[z]] = ef; + for (z = 0; z < b2->n; z++) f[b2->a[z]] = 0; + } + + for (k = 0; k < sn; k++) fs[s[k]] = 0; + for (k = 0; k < en; k++) fs[e[k]] = 0; + for (k = 0; k < b2->n; k++) f[b2->a[k]] = 0; + + return tot; +} + + +void prt_scf_dbg_nds(u_trans_t *z, uint32_t *za, uint32_t *zd, uint32_t zn, ma_ug_t *ref, ma_ug_t *hu1, ma_ug_t *hu2, uint64_t s_end, uint64_t e_end, const char *cmd) +{ + uint64_t k, id, rev, ui, uid; ma_utg_t *u; + + id = z->tn; rev = z->rev; + u = ((idu.n)?&(hu1->u.a[id]):&(hu2->u.a[id-hu1->u.n])); + ui = ((idu.n)?1:2); uid = ((idu.n)?(id):(id-hu1->u.n)); + + fprintf(stderr, "\nCMD::%s[M::%s] h%lutg%.6lu%c(%c)\ts::%u\te::%u\ts_end::%lu\te_end::%lu\tref_n::%u\n", cmd, __func__, + ui, uid+1, "lc"[u->circ], "+-"[rev], z->ts, z->te, s_end, e_end, (uint32_t)ref->u.n); + for (k = 0; k < zn; k++) { + fprintf(stderr, "[%u|%u]", (za[k]>>1), (za[k]&1)); + fprintf(stderr, "utg%.6u%c(%c)(d::%u),", (za[k]>>1)+1, "lc"[ref->u.a[(za[k]>>1)].circ], "+-"[za[k]&1], zd[k]); + } + fprintf(stderr, "\n"); +} + + +uint64_t is_small_tangle(u_trans_t *s, u_trans_t *e, ma_ug_t *ref, ma_ug_t *hu1, ma_ug_t *hu2, scaf_res_t *cp1, scaf_res_t *cp2, asg32_v *b1, asg32_v *b2, asg32_v *b3, asg32_v *b4, uint8_t *f, uint8_t *fs, uint8_t *fsk, uint64_t max_len, uint64_t *is_dip) +{ + uint64_t tot; uint32_t *va = NULL, *vd = NULL, *wa = NULL, *wd = NULL, vn = 0, wn = 0; + b1->n = b2->n = b3->n = b4->n = 0; (*is_dip) = 0; + // v = get_ref_nid(s, hu1, hu2, cp1, cp2, 1, 1000); if(v == ((uint64_t)-1)) return ((uint64_t)-1); + // w = get_ref_nid(e, hu1, hu2, cp1, cp2, 0, 1000); if(w == ((uint64_t)-1)) return ((uint64_t)-1); + // if((v>>1) == (w>>1)) return ((uint64_t)-1); + + // fprintf(stderr, "-0-[M::%s]\n", __func__); + vn = get_ref_nid_flank(s, ref, hu1, hu2, cp1, cp2, 1, 1000, max_len, b3); if(!vn) return ((uint64_t)-1); + // fprintf(stderr, "-1-[M::%s]\n", __func__); + wn = get_ref_nid_flank(e, ref, hu1, hu2, cp1, cp2, 0, 1000, max_len, b3); if(!wn) return ((uint64_t)-1); + // fprintf(stderr, "-2-[M::%s]\n", __func__); + va = b3->a; vd = va + vn; wa = vd + vn; wd = wa + wn; + // prt_scf_dbg_nds(s, va, vd, vn, ref, hu1, hu2, 0, 1, "rawS"); + // prt_scf_dbg_nds(e, wa, wd, wn, ref, hu1, hu2, 1, 0, "rawE"); + dedup_tang_flank(va, vd, &vn, wa, wd, &wn); + if((!vn) || (!wn)) return ((uint64_t)-1); + // prt_scf_dbg_nds(s, va, vd, vn, ref, hu1, hu2, 0, 1, "fffS"); + // prt_scf_dbg_nds(e, wa, wd, wn, ref, hu1, hu2, 1, 0, "fffE"); + + tot = cal_self_close_tang_flank(ref->g, va, vd, vn, wa, wd, wn, b1, b2, b4, f, fs, fsk, max_len, is_dip); + if(tot > max_len) tot = ((uint64_t)-1); + return tot; + + // l0 = cal_self_close_tang(ref->g, v, w, b1, b2, f, max_len); if(l0 == ((uint64_t)-1)) return ((uint64_t)-1); + // l1 = cal_self_close_tang(ref->g, w^1, v^1, b1, b2, f, max_len); if(l1 == ((uint64_t)-1)) return ((uint64_t)-1); + // return MIN(l0, l1); +} + + +uint32_t gen_scaf_path_lst(asg_t *g, uint32_t s, asg64_v *b, uint64_t *blen, uint64_t *bf) +{ + + uint32_t v = s, w = 0; uint64_t m, c1 = 0, c2 = 0; + uint32_t kv, kw; (*blen) = 0; (*bf) = 0; + + while (1) { + kv = get_arcs(g, v, &w, 1); + m = v; + if(kv == 1) m |= ((uint64_t)(g->arc[w].ol))<<32; + kv_push(uint64_t, *b, m); + (*blen) += g->seq[v>>1].len + ((kv == 1)?g->arc[w].ol:0); + if(g->seq[v>>1].c == 1) c1++; + else if(g->seq[v>>1].c == 2) c2++; + + if(kv == 0) {(*bf) = (c1>=c2?1:2); return END_TIPS;} + if(kv == 2) {(*bf) = (c1>=c2?1:2); return TWO_OUTPUT;} + if(kv > 2) {(*bf) = (c1>=c2?1:2); return MUL_OUTPUT;} + w = g->arc[w].v; + ///up to here, kv=1 + ///kw must >= 1 + kw = get_arcs(g, w^1, NULL, 0); + v = w; + + if(kw == 2) {(*bf) = (c1>=c2?1:2); return TWO_INPUT;} + if(kw > 2) {(*bf) = (c1>=c2?1:2); return MUL_INPUT;} + if(v == s) {(*bf) = (c1>=c2?1:2); return LOOP;} + } + + (*bf) = (c1>=c2?1:2); + return LONG_TIPS; +} + +uint64_t prt_gen_scaf_path_lst(uint64_t *a, uint64_t n, uint64_t is_circle, ma_ug_t *hu1, ma_ug_t *hu2) +{ + uint64_t k, nl, el, tl, id, rev, ui, uid; ma_utg_t *u; + for (k = nl = el = tl = 0; k < n; k++) { + nl += (a[k]>>32); id = ((uint32_t)a[k])>>1; + u = ((idu.n)?&(hu1->u.a[id]):&(hu2->u.a[id-hu1->u.n])); + el += u->len; + } + tl = el + nl; + fprintf(stderr, "[M::%s]tot::%lu\tel::%lu\tnl::%lu\tis_c::%lu\t#::%lu\t%s\n", __func__, tl, el, nl, is_circle, n, ((n>1)?"scf":"ctg")); + for (k = 0; k < n; k++) { + id = ((uint32_t)a[k])>>1; rev = ((uint32_t)a[k])&1; + u = ((idu.n)?&(hu1->u.a[id]):&(hu2->u.a[id-hu1->u.n])); + ui = ((idu.n)?1:2); nl = (a[k]>>32); + uid = ((idu.n)?(id):(id-hu1->u.n)); + fprintf(stderr, "[M::%s] h%lutg%.6lu%c(%c)\tlen::%u\tNs::%lu\n", __func__, + ui, uid+1, "lc"[u->circ], "+-"[rev], u->len, nl); + } + return tl; +} + +void cal_arr_N50(uint32_t *a, uint32_t n, const char *cmd) +{ + uint64_t i, s, len; + for (i = len = 0; i < n; i++) len += a[i]; + radix_sort_arch32(a, a + n); + + i = n; s = 0; + while (i > 0) { + s += a[--i]; + if(s >= (len>>1)) { + fprintf(stderr, "CMD::%s[M::%s::] N50: %u, Size::%lu\n", cmd, __func__, a[i], len); + break; + } + } +} + +void set_fsk(uint8_t *fsk, uint8_t mm, ul_vec_t *idx) +{ + uint64_t k, m, a_n; uc_block_t *a; + for (k = 0; k < idx->bb.n; k++) { + a = idx->bb.a + idx->bb.a[k].ts; a_n = idx->bb.a[k].te - idx->bb.a[k].ts; + for(m = 0; m < a_n; m++) fsk[a[m].hid] = mm; + } +} + + +void gen_double_scaffold_gfa(ma_ug_t *ref, ma_ug_t *hu1, ma_ug_t *hu2, scaf_res_t *cp1, scaf_res_t *cp2, asg_t *sg, kv_u_trans_t *os, int64_t minNs, kvect_sec_t **sc1, kvect_sec_t **sc2) +{ + order_contig_trans(os); + asg_t *g = asg_init(); uint64_t k, s, e, z, rn, pn, dn, rthres = asm_opt.self_scaf_reliable_min, el, glen; ma_utg_t *u, *ut; u_trans_t *p, *d, *r; + asg_arc_t t, *c; int64_t zs, ze, gl, gmin = -asm_opt.self_scaf_gap_max, gmax = asm_opt.self_scaf_gap_max; uint64_t mf, blen, bf, is_dip; uint32_t sce[2]; sce[0] = 0; sce[1] = 1; + asg32_v b0, b1, b2, b3; kv_init(b0); kv_init(b1); kv_init(b2); kv_init(b3); uint8_t *f, *fs, *fsk; CALLOC(f, (ref->u.n<<1)); CALLOC(fs, (ref->u.n<<1)); CALLOC(fsk, ref->u.n); + + // fprintf(stderr, "[M::%s::] SSS\n", __func__); + for (k = 0; k < os->idx.n; ++k) { + u = ((ku.n)?&(hu1->u.a[k]):&(hu2->u.a[k-hu1->u.n])); + asg_seq_set(g, k, u->len, 0); g->seq[k].c = (ku.n?1:2); + } + + // fprintf(stderr, "[M::%s::] MMM\n", __func__); + for (k = 0; k < os->idx.n; ++k) { + + r = u_trans_a(*os, k); rn = u_trans_n(*os, k); p = d = NULL; pn = dn = 0; + // fprintf(stderr, "[M::%s::] # k:%lu, rn::%lu\n", __func__, k, rn); + if(rn <= 1) continue; + for (z = 0, p = r; (z < rn) && (!(r[z].del)); z++); pn = z; + for (d = r + z; (z < rn) && (r[z].qs != ((uint32_t)-1)); z++); dn = z - pn; + // fprintf(stderr, "[M::%s::] # k:%lu, pn::%lu, dn::%lu\n", __func__, k, pn, dn); + if(pn <= 1) continue;///no scaf + s = e = (uint64_t)-1; + for (z = 0; z < pn; z++) { + if((p[z].f) || (((p[z].qe-p[z].qs) >= rthres) && ((p[z].te-p[z].ts) >= rthres))) { + s = e = z; break; + } + } + for (; z < pn; z++) { + if((p[z].f) || (((p[z].qe-p[z].qs) >= rthres) && ((p[z].te-p[z].ts) >= rthres))) e = z; + } + // fprintf(stderr, "[M::%s::] # k:%lu, s::%lu, e::%lu\n", __func__, k, s, e); + ///[s, e] + u = ((ku.n)?&(hu1->u.a[k]):&(hu2->u.a[k-hu1->u.n])); + set_fsk(fsk, 1, ((ku.n)?&(cp1->a[k]):&(cp2->a[k-hu1->u.n]))); + for (z = 1; z < pn; z++) { + ///cannot scaffold circles + ut = ((p[z-1].tnu.n)?&(hu1->u.a[p[z-1].tn]):&(hu2->u.a[p[z-1].tn-hu1->u.n])); if(ut->circ) continue; + ut = ((p[z].tnu.n)?&(hu1->u.a[p[z].tn]):&(hu2->u.a[p[z].tn-hu1->u.n])); if(ut->circ) continue; + // fprintf(stderr, "[M::%s::] # k:%lu, z::%lu, p[z-1].tn::%u, p[z].tn::%u, hu1->u.n::%u, hu2->u.n::%u\n", __func__, k, z, p[z-1].tn, p[z].tn, (uint32_t)hu1->u.n, (uint32_t)hu2->u.n); + + el = 0; gl = 0; + if((sce[(p[z-1].tnu.n)?0:1] != sce[(p[z-1].qnu.n)?0:1]) && (sce[(p[z].tnu.n)?0:1] != sce[(p[z].qnu.n)?0:1])) { + if((((z-1)>=s)&&((z-1)<=e))&&(((z)>=s)&&((z)<=e))) el = 1; + // fprintf(stderr, "-0-[M::%s::] # k::%lu, z::%lu, el::%lu\n", __func__, k, z, el); + if(el) { + zs = MAX(p[z-1].qs, p[z].qs); + ze = MIN(p[z-1].qe, p[z].qe); + if(is_noisy_ov(MIN(zs, ze), MAX(zs, ze), u->len, 2500000, 0.5, d, dn)) el = 0; + // fprintf(stderr, "-1-[M::%s::] # k::%lu, z::%lu, el::%lu, zs::%ld, ze::%ld\n", __func__, k, z, el, zs, ze); + if(el) { + gl = zs - ze; + if((!(p[z-1].f)) || (!(p[z].f))) { + if(gl >= gmin && gl <= gmax) el = 1; + else el = 0;///gap is too long + } else if(p[z-1].f && p[z].f) { + if(gl <= gmax) el = 1; ///p[z-1] and p[z] are too far away is bad; p[z-1] and p[z] are overlapped is fine + else el = 0; + } + } + // fprintf(stderr, "-2-[M::%s::] # k::%lu, z::%lu, el::%lu, zs::%ld, ze::%ld, p[z-1].f::%u, p[z].f::%u, gmin::%ld, gmax::%ld\n", __func__, k, z, el, zs, ze, p[z-1].f, p[z].f, gmin, gmax); + } + ///final try in graph + // fprintf(stderr, "[M::%s::] # k::%lu, z::%lu, el::%lu, gl::%ld\n", __func__, k, z, el, gl); + is_dip = 0; + + glen = is_small_tangle(&(p[z-1]), &(p[z]), ref, hu1, hu2, cp1, cp2, &b0, &b1, &b2, &b3, f, fs, fsk, 5000000, &is_dip); + // fprintf(stderr, "[M::%s::] # k::%lu, z::%lu, el::%lu, glen::%lu, gl::%ld\n", __func__, k, z, el, glen, gl); + if((glen != (uint64_t)-1) && (((int64_t)glen) <= asm_opt.self_scaf_gap_max)) { + el = 1; if(!is_dip) gl = glen; + } + } + if(gl < minNs) gl = minNs; + if(el) { + ///in anyway the gap cannot be too large + if(gl <= gmax) el = 1; ///p[z-1] and p[z] are too far away is bad; p[z-1] and p[z] are overlapped is fine + else el = 0; + } + + scaf_hit2arc(&(p[z-1]), &(p[z]), 0, el, gl, &t); + c = asg_arc_pushp(g); *c = t; + scaf_hit2arc(&(p[z]), &(p[z-1]), 1, el, gl, &t); + c = asg_arc_pushp(g); *c = t; + } + set_fsk(fsk, 0, ((ku.n)?&(cp1->a[k]):&(cp2->a[k-hu1->u.n]))); + } + asg_cleanup(g); asg_symm(g); + // fprintf(stderr, "[M::%s::] EEE\n", __func__); + + uint64_t v, n_vtx = g->n_seq<<1, nv; asg_arc_t *av; + for (v = 0; v < n_vtx; ++v) { + if (g->seq[v>>1].del) continue; + nv = asg_arc_n(g, v); + if(nv <= 1) continue; + av = asg_arc_a(g, v); + for(k = 0; k < nv; k++) { + if(av[k].del) continue; + av[k].del = 1; + asg_arc_del(g, av[k].v^1, (av[k].ul>>32)^1, 1); + } + } + for (k = 0; k < g->n_arc; k++) { + if(!(g->arc[k].el)) g->arc[k].del = 1; + } + asg_cleanup(g); + + if((ref->u.n<<1) < g->n_seq) REALLOC(f, g->n_seq); + memset(f, 0, sizeof((*f))*g->n_seq); + asg64_v b64; kv_init(b64); kvect_sec_t *i1 = NULL, *i2 = NULL, *im = NULL; uint64_t len[2], mmip; + sec_t *mmi; CALLOC(i1, 1); CALLOC(i2, 1); i1->ctg = hu1; i2->ctg = hu2; + + b0.n = 0; mmip = hu1->u.n; mmip <<= 1; + for (v = 0; v < n_vtx; ++v) { + nv = asg_arc_n(g, v); assert(nv <= 1); + if(f[v>>1]) continue; + if(asg_arc_n(g, (v^1)) != 0) continue; + b64.n = 0; mf = gen_scaf_path_lst(g, v, &b64, &blen, &bf); + assert(mf == END_TIPS); + // rn = prt_gen_scaf_path_lst(b64.a, b64.n, (mf==LOOP)?1:0, hu1, hu2); kv_push(uint32_t, b0, rn); + for (k = len[0] = len[1] = 0; k < b64.n; k++) { + f[((uint32_t)b64.a[k])>>1] = 1; + len[g->seq[((uint32_t)b64.a[k])>>1].c-1] += g->seq[((uint32_t)b64.a[k])>>1].len; + } + im = ((len[0] >= len[1])?i1:i2); kv_pushp(sec_t, (*im), &mmi); memset(mmi, 0, sizeof((*mmi))); + mmi->is_c = 0; mmi->n = mmi->m = b64.n; MALLOC(mmi->a, mmi->n); memcpy(mmi->a, b64.a, sizeof((*(mmi->a)))*mmi->n); + for (k = 0; k < mmi->n; k++) { + if((((uint32_t)mmi->a[k])>>1) >= hu1->u.n) mmi->a[k] -= mmip; + } + } + + for (v = 0; v < n_vtx; ++v) { + nv = asg_arc_n(g, v); assert(nv <= 1); + if(f[v>>1]) continue; + assert((nv == 1) && (asg_arc_n(g, (v^1)) == 1)); + b64.n = 0; mf = gen_scaf_path_lst(g, v, &b64, &blen, &bf); + assert(mf == LOOP); + // rn = prt_gen_scaf_path_lst(b64.a, b64.n, (mf==LOOP)?1:0, hu1, hu2); kv_push(uint32_t, b0, rn); + for (k = len[0] = len[1] = 0; k < b64.n; k++) { + f[((uint32_t)b64.a[k])>>1] = 1; + len[g->seq[((uint32_t)b64.a[k])>>1].c-1] += g->seq[((uint32_t)b64.a[k])>>1].len; + } + im = ((len[0] >= len[1])?i1:i2); + im = ((len[0] >= len[1])?i1:i2); kv_pushp(sec_t, (*im), &mmi); memset(mmi, 0, sizeof((*mmi))); + mmi->is_c = 1; mmi->n = mmi->m = b64.n; MALLOC(mmi->a, mmi->n); memcpy(mmi->a, b64.a, sizeof((*(mmi->a)))*mmi->n); + for (k = 0; k < mmi->n; k++) { + if((((uint32_t)mmi->a[k])>>1) >= hu1->u.n) mmi->a[k] -= mmip; + } + } + kv_destroy(b64); + // cal_arr_N50(b0.a, b0.n, "Scf"); + + // b1.n = 0; + // for(k = 0; k < hu1->u.n; k++) kv_push(uint32_t, b1, hu1->u.a[k].len); + // for(k = 0; k < hu2->u.n; k++) kv_push(uint32_t, b1, hu2->u.a[k].len); + // cal_arr_N50(b1.a, b1.n, "Ctg"); + + kv_destroy(b0); kv_destroy(b1); kv_destroy(b2); kv_destroy(b3); free(f); free(fs); free(fsk); asg_destroy(g); + (*sc1) = i1; (*sc2) = i2; +} + +kv_u_trans_t *gen_shallow_ref_trans(bubble_type *bu, scaf_res_t *cp0, scaf_res_t *cp1, asg_t *sg, ma_ug_t *ref, ma_sub_t* cover, ma_hit_t_alloc* src, ma_hit_t_alloc* rev, +long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t) +{ + kvec_asg_arc_t_warp ne; kv_init(ne.a); kv_u_trans_t *p = NULL; CALLOC(p, 1); + ma_ug_t *cref = gen_shallow_clean_ug(ref, bu, cp0, cp1); + asg_t *csg = copy_read_graph(sg); hap_cov_t *cov = NULL; ne.a.n = 0; + + adjust_utg_by_primary(&cref, csg, TRIO_THRES, src, rev, cover, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, &ne, &cov, b_mask_t, 0, 0); + + ma_ug_destroy(cref); asg_destroy(csg); kv_destroy(ne.a); + p->n = cov->t_ch->k_trans.n; p->m = cov->t_ch->k_trans.m; p->a = cov->t_ch->k_trans.a; + p->idx.n = cov->t_ch->k_trans.idx.n; p->idx.m = cov->t_ch->k_trans.idx.m; p->idx.a = cov->t_ch->k_trans.idx.a; + cov->t_ch->k_trans.n = cov->t_ch->k_trans.m = 0; cov->t_ch->k_trans.a = NULL; + cov->t_ch->k_trans.idx.n = cov->t_ch->k_trans.idx.m = 0; cov->t_ch->k_trans.idx.a = NULL; + destory_hap_cov_t(&cov); + return p; +} + +kv_u_trans_t *gen_deep_ref_trans(kv_u_trans_t *in, scaf_res_t *cp0, scaf_res_t *cp1, asg_t *sg, ma_ug_t *ref, ma_sub_t* cover, ma_hit_t_alloc* src, ma_hit_t_alloc* rev, +long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t, ug_opt_t *opt) +{ + kvec_asg_arc_t_warp ne; kv_init(ne.a); kv_u_trans_t *p = NULL; CALLOC(p, 1); + ma_ug_t *cref = gen_clean_ug(ref, cp0, cp1); ma_ug_t *ccref = copy_untig_graph(cref); + asg_t *csg = copy_read_graph(sg); hap_cov_t *cov = NULL; ne.a.n = 0; + // print_debug_gfa(sg, cref, cover, "cl.sb.utg", src, ruIndex, opt->max_hang, opt->min_ovlp, 0, 0, 0); + + adjust_utg_by_primary(&cref, csg, TRIO_THRES, src, rev, cover, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, &ne, &cov, b_mask_t, 1, 0); + + ma_ug_destroy(cref); asg_destroy(csg); kv_destroy(ne.a); + if(in) { + kv_resize(u_trans_t, cov->t_ch->k_trans, cov->t_ch->k_trans.n + in->n); + memcpy(cov->t_ch->k_trans.a + cov->t_ch->k_trans.n, in->a, (in->n*sizeof((*(in->a))))); + cov->t_ch->k_trans.n += in->n; + clean_u_trans_t_idx_filter_adv(&(cov->t_ch->k_trans), ref, sg, 0.6, 1); + } + + filter_u_trans(&(cov->t_ch->k_trans), asm_opt.is_bub_trans, asm_opt.is_topo_trans, asm_opt.is_read_trans, asm_opt.is_base_trans); + if(asm_opt.is_base_trans) { + ma_ug_seq(ccref, sg, cover, src, NULL, max_hang, min_ovlp, NULL, 0);///polish + trans_base_infer(ccref, sg, opt, &(cov->t_ch->k_trans), NULL); + ma_ug_destroy(ccref); + } + clean_u_trans_t_idx_filter_adv(&(cov->t_ch->k_trans), ref, sg, 0.6, 0); + + p->n = cov->t_ch->k_trans.n; p->m = cov->t_ch->k_trans.m; p->a = cov->t_ch->k_trans.a; + p->idx.n = cov->t_ch->k_trans.idx.n; p->idx.m = cov->t_ch->k_trans.idx.m; p->idx.a = cov->t_ch->k_trans.idx.a; + cov->t_ch->k_trans.n = cov->t_ch->k_trans.m = 0; cov->t_ch->k_trans.a = NULL; + cov->t_ch->k_trans.idx.n = cov->t_ch->k_trans.idx.m = 0; cov->t_ch->k_trans.idx.a = NULL; + destory_hap_cov_t(&cov); + return p; +} + +ma_ug_t *merge_utgs(ma_ug_t *in0, ma_ug_t *in1) +{ + uint64_t k, m[2]; ma_utg_t *su = NULL, *du = NULL; + m[0] = ((uint64_t)(in0->g->n_seq))<<1; + m[1] = ((uint64_t)(in0->g->n_seq))<<33; + ma_ug_t *ug = NULL; CALLOC(ug, 1); ug->g = asg_init(); + ug->g->n_F_seq = in0->g->n_F_seq + in1->g->n_F_seq; + ug->g->r_seq = in0->g->r_seq + in1->g->r_seq; + ug->g->m_seq = ug->g->n_seq = in0->g->n_seq + in1->g->n_seq; + MALLOC(ug->g->seq, ug->g->n_seq); + memcpy(ug->g->seq, in0->g->seq, sizeof((*(ug->g->seq)))*in0->g->n_seq); + memcpy(ug->g->seq+in0->g->n_seq, in1->g->seq, sizeof((*(ug->g->seq)))*in1->g->n_seq); + + ug->g->m_arc = ug->g->n_arc = in0->g->n_arc + in1->g->n_arc; + MALLOC(ug->g->arc, ug->g->n_arc); + memcpy(ug->g->arc, in0->g->arc, sizeof((*(ug->g->arc)))*in0->g->n_arc); + for(k = 0; k < in1->g->n_arc; k++) { + ug->g->arc[k+in0->g->n_arc] = in1->g->arc[k]; + ug->g->arc[k+in0->g->n_arc].v += m[0]; + ug->g->arc[k+in0->g->n_arc].ul += m[1]; + } + asg_cleanup(ug->g); asg_symm(ug->g); + + ug->u.m = ug->u.n = in0->u.n + in1->u.n; + MALLOC(ug->u.a, ug->u.n); + for (k = 0; k < in0->u.n; k++) { + su = &(in0->u.a[k]); du = &(ug->u.a[k]); + (*du) = (*su); + du->s = NULL; du->a = NULL; du->m = du->n = su->n; + MALLOC(du->a, du->n); memcpy(du->a, su->a, sizeof((*(du->a)))*du->n); + } + for (k = 0; k < in1->u.n; k++) { + su = &(in1->u.a[k]); du = &(ug->u.a[k + in0->u.n]); + (*du) = (*su); + du->s = NULL; du->a = NULL; du->m = du->n = su->n; + MALLOC(du->a, du->n); memcpy(du->a, su->a, sizeof((*(du->a)))*du->n); + } + + return ug; +} + +scaf_res_t *merge_scaf_res(scaf_res_t *in0, scaf_res_t *in1) +{ + uint64_t k; ul_vec_t *m; ul_vec_t *s; + scaf_res_t *p = NULL; CALLOC(p, 1); + p->n = p->m = in0->n + in1->n; + CALLOC(p->a, p->n); + for(k = 0; k < in0->n; k++) { + m = &(p->a[k]); s = &(in0->a[k]); + *m = *s; + + m->r_base.a = NULL; + if(m->r_base.m) { + MALLOC(m->r_base.a, m->r_base.m); + memcpy(m->r_base.a, s->r_base.a, sizeof((*(m->r_base.a)))*m->r_base.m); + } + + m->bb.a = NULL; + if(m->bb.m) { + MALLOC(m->bb.a, m->bb.m); + memcpy(m->bb.a, s->bb.a, sizeof((*(m->bb.a)))*m->bb.m); + } + + m->N_site.a = NULL; + if(m->N_site.m) { + MALLOC(m->N_site.a, m->N_site.m); + memcpy(m->N_site.a, s->N_site.a, sizeof((*(m->N_site.a)))*m->N_site.m); + } + } + + for(k = 0; k < in1->n; k++) { + m = &(p->a[k + in0->n]); s = &(in1->a[k]); + *m = *s; + + m->r_base.a = NULL; + if(m->r_base.m) { + MALLOC(m->r_base.a, m->r_base.m); + memcpy(m->r_base.a, s->r_base.a, sizeof((*(m->r_base.a)))*m->r_base.m); + } + + m->bb.a = NULL; + if(m->bb.m) { + MALLOC(m->bb.a, m->bb.m); + memcpy(m->bb.a, s->bb.a, sizeof((*(m->bb.a)))*m->bb.m); + } + + m->N_site.a = NULL; + if(m->N_site.m) { + MALLOC(m->N_site.a, m->N_site.m); + memcpy(m->N_site.a, s->N_site.a, sizeof((*(m->N_site.a)))*m->N_site.m); + } + } + + return p; +} + +kv_u_trans_t *gen_contig_trans_func(ma_ug_t *ref, ma_ug_t *hu0, ma_ug_t *hu1, scaf_res_t *cp0, scaf_res_t *cp1, asg_t *sg, bubble_type *bu, ug_opt_t *opt, kv_u_trans_t *bd, uint8_t is_sep) +{ + + kv_u_trans_t *os; CALLOC(os, 1); + if(is_sep) { + gen_contig_trans(opt, sg, hu1, cp1, hu0, cp0, ref, bd, hu0->u.n, 0, bu, os); + gen_contig_trans(opt, sg, hu0, cp0, hu1, cp1, ref, bd, 0, hu0->u.n, bu, os); + } else { + ma_ug_t *mg = merge_utgs(hu0, hu1); scaf_res_t *ms = merge_scaf_res(cp0, cp1); + gen_contig_self(opt, sg, mg, ms, ref, bd, hu0->u.n, bu, os); + ma_ug_destroy(mg); destroy_scaf_res_t(ms); + } + + kt_u_trans_t_idx(os, hu0->u.n + hu1->u.n); + update_ctg_trans_cc(os); + return os; +} + +void double_scaffold(ma_ug_t *ref, ma_ug_t *hu0, ma_ug_t *hu1, scaf_res_t *cp0, scaf_res_t *cp1, asg_t *sg, ma_sub_t* cover, ma_hit_t_alloc* src, ma_hit_t_alloc* rev, +long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t, ug_opt_t *opt, kvect_sec_t **sc0, kvect_sec_t **sc1) +{ + bubble_type *bu = NULL; uint8_t *bf = NULL; + bu = gen_bubble_chain(sg, ref, opt, &bf, 0); free(bf); + + kv_u_trans_t *bs = gen_shallow_ref_trans(bu, cp0, cp1, sg, ref, cover, src, rev, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, b_mask_t); + kv_u_trans_t *bd = gen_deep_ref_trans(bs, cp0, cp1, sg, ref, cover, src, rev, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, b_mask_t, opt); + kv_destroy((*bs)); kv_destroy(bs->idx); free(bs); + + kv_u_trans_t *os = gen_contig_trans_func(ref, hu0, hu1, cp0, cp1, sg, bu, opt, bd, 0/**1**/); + destory_bubbles(bu); free(bu); kv_destroy((*bd)); kv_destroy(bd->idx); free(bd); + + // print_ctg_trans_ovlp(os, ref, hu0, cp0, hu1, cp1); + gen_double_scaffold_gfa(ref, hu0, hu1, cp0, cp1, sg, os, 100, sc0, sc1); + kv_destroy((*os)); kv_destroy(os->idx); free(os); +} + + +void gen_self_scaf(ug_opt_t *opt, ma_ug_t *hu0, ma_ug_t *hu1, asg_t *sg, ma_sub_t *cov, ma_hit_t_alloc *src, ma_hit_t_alloc *rev, R_to_U *ri, +long long tipsLen, float tip_drop_ratio, long long stops_threshold, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t, +kvect_sec_t **sc0, kvect_sec_t **sc1) +{ + ma_ug_t *ug = NULL; + + ug = ma_ug_gen(sg); + + ///print_debug_gfa(sg, ug, cov, "sb.utg", src, ri, opt->max_hang, opt->min_ovlp, 0, 0, 0); + + ///fprintf(stderr, "[M::%s] hu0\n", __func__); + scaf_res_t *cp0 = gen_contig_path(opt, sg, hu0, ug); ///prt_scaf_res_t(cp0, ug, hu0, "h1tg"); + ///fprintf(stderr, "[M::%s] hu1\n", __func__); + scaf_res_t *cp1 = gen_contig_path(opt, sg, hu1, ug); ///prt_scaf_res_t(cp1, ug, hu1, "h2tg"); + + double_scaffold(ug, hu0, hu1, cp0, cp1, sg, cov, src, rev, tipsLen, tip_drop_ratio, stops_threshold, ri, chimeric_rate, drop_ratio, max_hang, min_ovlp, b_mask_t, opt, sc0, sc1); + + ma_ug_destroy(ug); destroy_scaf_res_t(cp0); destroy_scaf_res_t(cp1); + +} + +void destory_kvect_sec_t(kvect_sec_t *p) +{ + uint64_t k; + for(k = 0; k < p->n; k++) free(p->a[k].a); + free(p->a); +} + +void output_trio_graph_joint(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, +long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, +int min_ovlp, long long gap_fuzz, bub_label_t* b_mask_t, ma_ug_t **rhu0, ma_ug_t **rhu1, ug_opt_t *opt) +{ + ma_ug_t *hu0 = NULL, *hu1 = NULL; kvec_asg_arc_t_warp arcs0, arcs1; + memset(&arcs0, 0, sizeof(arcs0)); memset(&arcs1, 0, sizeof(arcs1)); + + reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, ruIndex, NULL); + + hu0 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, + reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, + drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &arcs0); + + hu1 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, + reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, + drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &arcs1); + + dedup_idx_t *hidx0 = NULL, *hidx1 = NULL; uint8_t *ff; CALLOC(ff, sg->n_seq); + hidx0 = gen_dedup_idx_t(hu0, sg); hidx1 = gen_dedup_idx_t(hu1, sg); + update_recover_atg_cov(); + + uint64_t dedup_base = 0, miss_base = 0, s; + s = dedup_exact_ug(hidx1, hidx0, coverage_cut, sources, ruIndex, ff, FATHER); dedup_base += s; + s = dedup_exact_ug(hidx0, hidx1, coverage_cut, sources, ruIndex, ff, MOTHER); dedup_base += s; + destroy_dedup_idx_t(hidx0); destroy_dedup_idx_t(hidx1); + + s = append_miss_nid(sg, hu0, hu1, ff, PHASE_MISS_LEN, PHASE_MISS_N); miss_base += s; + // s = append_miss_nid(sg, hu0, hu1, ff, PHASE_MISS_SLEN, PHASE_MISS_SN); miss_base += s; + free(ff); + + renew_utg((&hu0), sg, &arcs0); renew_utg((&hu1), sg, &arcs1); + fprintf(stderr, "[M::%s] dedup_base::%lu, miss_base::%lu\n", __func__, dedup_base, miss_base); + + if((asm_opt.self_scaf) && (!rhu0) && (!rhu1)) { + kvect_sec_t *sc0 = NULL, *sc1 = NULL; + gen_self_scaf(opt, hu0, hu1, sg, coverage_cut, sources, reverse_sources, ruIndex, tipsLen, tip_drop_ratio, stops_threshold, chimeric_rate, drop_ratio, max_hang, min_ovlp, b_mask_t, &sc0, &sc1); + + output_hap_sc_graph(sc0, sg, &arcs0, coverage_cut, output_file_name, FATHER, sources, ruIndex, max_hang, min_ovlp, NULL); + destory_kvect_sec_t(sc0); free(sc0); ma_ug_destroy(hu0); kv_destroy(arcs0.a); + + output_hap_sc_graph(sc1, sg, &arcs1, coverage_cut, output_file_name, MOTHER, sources, ruIndex, max_hang, min_ovlp, NULL); + destory_kvect_sec_t(sc1); free(sc1); ma_ug_destroy(hu1); kv_destroy(arcs1.a); + } else { + if(!rhu0) { + output_hap_graph(hu0, sg, &arcs0, coverage_cut, output_file_name, FATHER, sources, ruIndex, max_hang, min_ovlp, NULL); + ma_ug_destroy(hu0); + } else { + (*rhu0) = hu0; + } + kv_destroy(arcs0.a); + + + if(!rhu1) { + output_hap_graph(hu1, sg, &arcs1, coverage_cut, output_file_name, MOTHER, sources, ruIndex, max_hang, min_ovlp, NULL); + ma_ug_destroy(hu1); + } else { + (*rhu1) = hu1; // ma_ug_destroy(hu1); + } + kv_destroy(arcs1.a); } - kv_destroy(arcs1.a); } void output_read_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, long long n_read) diff --git a/Overlaps.h b/Overlaps.h index 955e528..7c5d46d 100644 --- a/Overlaps.h +++ b/Overlaps.h @@ -1176,7 +1176,7 @@ void extract_sub_overlaps(uint32_t i_tScur, uint32_t i_tEcur, uint32_t i_tSpre, uint32_t tn, kv_u_trans_hit_t* ktb, uint32_t bn); void clean_u_trans_t_idx(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); void clean_u_trans_t_idx_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); -void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g); +void clean_u_trans_t_idx_filter_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, double sc_sec_rate, uint64_t uniform_only); uint32_t test_dbug(ma_ug_t* ug, FILE* fp); void write_dbug(ma_ug_t* ug, FILE* fp); int asg_arc_identify_simple_bubbles_multi(asg_t *g, bub_label_t* x, int check_cross); diff --git a/hic.cpp b/hic.cpp index 7d46e88..fa1b3e6 100644 --- a/hic.cpp +++ b/hic.cpp @@ -17744,7 +17744,7 @@ void trio_phasing_refine(ma_ug_t *iug, asg_t* sg, kv_u_trans_t *ref, ug_opt_t *o uint64_t ug_n0 = ug->g->n_seq, k, i, ul, cis_n, trans_n, w_n, tot_hap, tot_r, frid, mrid, flag; kv_u_trans_t in; memset(&in, 0, sizeof(in)); ma_utg_t *p; u_trans_t *z; int64_t fh, mh; bubble_type *bub = gen_bubble_chain(sg, ug, opt, &bf, 0); free(bf); - clean_u_trans_t_idx_filter_adv(ref, ug, sg); + clean_u_trans_t_idx_filter_adv(ref, ug, sg, 0.95, 0); ///update ug itself ul = FATHER; frid = ug->g->n_seq; asg_seq_set(ug->g, frid, ul, 0); diff --git a/inter.cpp b/inter.cpp index 1b49004..8ef5250 100644 --- a/inter.cpp +++ b/inter.cpp @@ -23,6 +23,9 @@ KSEQ_INIT(gzFile, gzread) #define oreg_xe_lt(a, b) (((uint64_t)(a).x_pos_e<<32|(a).x_pos_s) < ((uint64_t)(b).x_pos_e<<32|(b).x_pos_s)) KSORT_INIT(or_xe, overlap_region, oreg_xe_lt) +#define u_trans_qs_key0(a) ((a).qs) +KRADIX_SORT_INIT(u_trans_qs0, u_trans_t, u_trans_qs_key0, member_size(u_trans_t, qs)) + void ha_get_ul_candidates_interface(ha_abufl_t *ab, int64_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, overlap_region_alloc *overlap_list_hp, Candidates_list *cl, double bw_thres, int max_n_chain, int keep_whole_chain, kvec_t_u8_warp* k_flag, kvec_t_u64_warp* chain_idx, overlap_region* f_cigar, kvec_t_u64_warp* dbg_ct, st_mt_t *sp, uint32_t high_occ, void *km); void ul_map_lchain(ha_abufl_t *ab, uint32_t rid, char* rs, uint64_t rl, uint64_t mz_w, uint64_t mz_k, const ul_idx_t *uref, overlap_region_alloc *overlap_list, Candidates_list *cl, double bw_thres, @@ -407,6 +410,8 @@ typedef struct { // global data structure for kt_pipeline() ma_ug_t *gfa; bubble_type *bub; kv_u_trans_t *ta; + uint64_t mm; + uint64_t soff; } ctdat_t; @@ -3406,7 +3411,7 @@ int64_t get_ecov_adv(const ul_idx_t *uref, const ug_opt_t *uopt, uint32_t v, uin for (i = 0; i < nv; i++) { if(av[i].del || av[i].v != w) continue; dt = av[i].ol; - // if(v==1772 && w==1769) fprintf(stderr, "+++v:%u, w:%u, ou:%u\n", v, w, av[i].ou); + // if((v>>1)==25288) fprintf(stderr, "+++v:%u, w:%u, ou:%ld\n", v, w, dt); // if((v>>1) == 3012 && (w>>1) == 3011) fprintf(stderr, "******************\n"); if(contain_off) { (*contain_off) = av[i].ou; @@ -5644,7 +5649,7 @@ uint64_t primary_chain_check(uint64_t *idx, int64_t idx_n, mg_lchain_t *a) int64_t gl_chain_linear(const ul_idx_t *uref, const ma_ug_t *ug, vec_mg_lchain_t *lc, vec_mg_lchain_t *sw, int64_t qlen, const ug_opt_t *uopt, int64_t bw, double ng_diff_thre, st_mt_t *bf, int64_t max_skip, int64_t max_iter, int64_t max_dis, int32_t *p, int64_t *f, -int64_t *t, int64_t n_ext) +int64_t *t, int64_t n_ext, uint64_t id) { int64_t i, j, lc_n = lc->n, mm_ovlp, x, m_idx, m_sc, qo, n_skip, k, k0, n_u, n_v, ni; int64_t max_f, st, max_ii, sc, csc, mm_sc, mm_idx, end_j, max, n_v0; @@ -5662,12 +5667,18 @@ int64_t *t, int64_t n_ext) mm_sc = csc; mm_idx = -1; n_skip = 0; end_j = -1; if ((x-st) > max_iter) st = x-max_iter; + // if(id == 123/** || id == 192**/) { + // fprintf(stderr, "[M::id->%lu::i->%ld::utg%.6d%c] q::[%u, %u); t::[%u, %u); st->%ld; x->%ld\n", id, i, (li->v>>1)+1, "lc"[ug->u.a[li->v>>1].circ], li->qs, li->qe, li->rs, li->re, st, x); + // } for (j = x; j >= st; --j) { // collect potential destination vertices lj = &lc->a[j]; if(lj->qe+G_CHAIN_INDEL <= li->qs) break; if(lj->qs >= li->qs) continue; set_ul_ov_t_by_mg_lchain_t(&uj, lj); qo = infer_rovlp(&ui, &uj, NULL, NULL, NULL, (ma_ug_t *)ug); ///overlap length in query (UL read) + // if(id == 123/** || id == 192**/) { + // fprintf(stderr, "[M::id->%lu::j->%ld] qo->%ld\n", id, j, qo); + // } if(/**li->v!=lj->v &&**/get_ecov_adv(uref, uopt, li->v^1, lj->v^1, bw, ng_diff_thre, qo, 0, NULL)) { sc = csc + f[j]; if(sc > mm_sc) { @@ -5828,7 +5839,7 @@ st_mt_t *bf, Chain_Data* dp, int64_t max_skip, int64_t max_iter, int64_t max_dis t = dp->tmp; p = dp->score; f = dp->pre; if(ng_diff_thre >= 0) { m_idx = gl_chain_linear(uref, ug, lc, sw, qlen, uopt, bw, ng_diff_thre, bf, - max_skip, max_iter, max_dis, p, f, t, n_ext); + max_skip, max_iter, max_dis, p, f, t, n_ext, -1); if(diff_thre < 0) return m_idx; if(m_idx >= 0 && primary_chain_check(bf->a, bf->n, lc->a)) return m_idx; else m_idx = -1; @@ -5973,7 +5984,7 @@ st_mt_t *bf, Chain_Data* dp, int64_t max_skip, int64_t max_iter, int64_t max_dis int64_t ctg_chain_graph(void *km, const ul_idx_t *uref, const ma_ug_t *ug, vec_mg_lchain_t *lc, vec_mg_lchain_t *sw, vec_mg_path_dst_t *dst, vec_sp_node_t *out, vec_mg_pathv_t *path, int64_t qlen, const ug_opt_t *uopt, int64_t bw, double ng_diff_thre, uint64_t *srt, -st_mt_t *bf, Chain_Data* dp, int64_t max_skip, int64_t max_iter, int64_t max_dis) +st_mt_t *bf, Chain_Data* dp, int64_t max_skip, int64_t max_iter, int64_t max_dis, uint64_t id) { bf->n = 0; if(lc->n == 0) return 0; @@ -6008,7 +6019,7 @@ st_mt_t *bf, Chain_Data* dp, int64_t max_skip, int64_t max_iter, int64_t max_dis t = dp->tmp; p = dp->score; f = dp->pre; m_idx = gl_chain_linear(uref, ug, lc, sw, qlen, uopt, bw, ng_diff_thre, bf, - max_skip, max_iter, max_dis, p, f, t, lc->n); + max_skip, max_iter, max_dis, p, f, t, lc->n, id); // if(m_idx >= 0 && primary_chain_check(bf->a, bf->n, lc->a)) return m_idx; // else m_idx = -1; @@ -13826,10 +13837,10 @@ void dump_linear_chain(ma_ug_t *ug, kv_ul_ov_t *lidx, vec_mg_lchain_t *res, int6 p->off = i; p->score = lidx->a[i].sec; p->qs = lidx->a[i].qs; p->qe = lidx->a[i].qe; p->rs = lidx->a[i].ts; p->re = lidx->a[i].te; - // if(ulid == 14714) { - // fprintf(stderr, "+[M::%s::]\tutg%.6dl(%c)\tq::[%d, %d)\tqlen::%ld\tt::[%d, %d)\ttlen::%u\n", __func__, + // if(ulid == 123) { + // fprintf(stderr, "+[M::%s::]\tutg%.6dl(%c)\tq::[%d, %d)\tqlen::%ld\tt::[%d, %d)\ttlen::%u\tsc::%d\n", __func__, // (int32_t)(p->v>>1)+1, "+-"[p->v&1], p->qs, p->qe, qlen, - // p->rs, p->re, ug->g->seq[p->v>>1].len); + // p->rs, p->re, ug->g->seq[p->v>>1].len, p->score); // } extend_end_coord(p, NULL, qlen, ug->g->seq[p->v>>1].len, &iqs, &iqe, &its, &ite); p->qs = iqs; p->qe = iqe; p->rs = its; p->re = ite; @@ -16036,7 +16047,7 @@ int64_t dp_max_skip, int64_t dp_max_iter, int64_t dp_max_dis) } -void push_ctg_res(ul_vec_t *rch, vec_mg_lchain_t *uc) +void push_ctg_res(ul_vec_t *rch, vec_mg_lchain_t *uc, uint64_t id) { // fprintf(stderr, "\n++[M::%s::%.*s(id:%ld), len:%u]\n", __func__, // UL_INF.nid.a[ulid].n, UL_INF.nid.a[ulid].a, ulid, rch->rlen); @@ -16045,14 +16056,11 @@ void push_ctg_res(ul_vec_t *rch, vec_mg_lchain_t *uc) for (k = 0, rch->bb.n = 0, tt = 0; k < ucn; k += ix->cnt + 1) { ix = &(uc->a[k]); assert(ix->v == (uint32_t)-1); kv_pushp(uc_block_t, rch->bb, &p); memset(p, 0, sizeof((*p))); - p->hid = (uint32_t)-1; p->qs = ix->qs; p->qe = ix->qe; p->ts = ix->cnt; tt += ix->cnt; + p->hid = k/**(uint32_t)-1**/; p->qs = ix->qs; p->qe = ix->qe; p->ts = ix->cnt; tt += ix->cnt; - // if(ulid == 14714) { - // fprintf(stderr, "\n[M::%s::ucn->%ld, k->%ld, kcnt->%d]\n", __func__, ucn, k, ix->cnt); - // print_debug_gchain(uref, uc->a + k + 1, ix->cnt, rch); + // if(id == 206) { + // fprintf(stderr, "init->[M::%s::ucn->%lu, k->%lu, kcnt->%d]\n", __func__, ucn, k, ix->cnt); // } - // gen_rovlp_chain_by_ul(rg, rch, uref, raw_idx, raw_chn, uc->a + k + 1, ix->cnt, swap, dp, - // dp_max_skip, dp_max_iter, dp_max_dis, ulid); } radix_sort_uc_block_t_qe_srt(rch->bb.a, rch->bb.a + rch->bb.n); @@ -16062,21 +16070,42 @@ void push_ctg_res(ul_vec_t *rch, vec_mg_lchain_t *uc) s = tt; e = tt + rch->bb.a[k].ts; rch->bb.a[k].ts = s; rch->bb.a[k].te = e; tt = e; - } - assert(tt <= rch->bb.m); + // if(id == 206) { + // fprintf(stderr, "mid->[M::%s::rch->bb.n->%u, k->%lu] [%lu, %lu)\n", + // __func__, (uint32_t)rch->bb.n, k, s, e); + // } - for (k = 0, tt = 0; k < ucn; k += ix->cnt + 1, tt++) { - ix = &(uc->a[k]); assert(ix->v == (uint32_t)-1); - rr = rch->bb.a + rch->bb.a[tt].ts; rrn = rch->bb.a[tt].te - rch->bb.a[tt].ts; - src = uc->a + k + 1; assert((uint32_t)ix->cnt == rrn); + ix = &(uc->a[rch->bb.a[k].hid]); assert(ix->v == (uint32_t)-1); + rr = rch->bb.a + rch->bb.a[k].ts; rrn = rch->bb.a[k].te - rch->bb.a[k].ts; + src = uc->a + rch->bb.a[k].hid + 1; assert((uint32_t)ix->cnt == rrn); for(z = 0; z < rrn; z++) { rr[z].hid = src[z].v>>1; rr[z].rev = src[z].v&1; rr[z].qs = src[z].qs; rr[z].qe = src[z].qe; rr[z].ts = src[z].rs; rr[z].te = src[z].re; - rr[z].aidx = k; + rr[z].aidx = rch->bb.a[k].hid; } + rch->bb.a[k].hid = (uint32_t)-1; + } + assert(tt <= rch->bb.m); + + // for (k = 0, tt = 0; k < ucn; k += ix->cnt + 1, tt++) { + // ix = &(uc->a[k]); assert(ix->v == (uint32_t)-1); + // rr = rch->bb.a + rch->bb.a[tt].ts; rrn = rch->bb.a[tt].te - rch->bb.a[tt].ts; + // src = uc->a + k + 1; + // if(!((uint32_t)ix->cnt == rrn)) { + // fprintf(stderr, "\n++[M::%s::(id::%lu)] ix->cnt::%d, rrn::%lu\n", __func__, id, ix->cnt, rrn); + // } + // assert((uint32_t)ix->cnt == rrn); + // for(z = 0; z < rrn; z++) { + // rr[z].hid = src[z].v>>1; + // rr[z].rev = src[z].v&1; + // rr[z].qs = src[z].qs; rr[z].qe = src[z].qe; + // rr[z].ts = src[z].rs; rr[z].te = src[z].re; + // rr[z].aidx = k; + // } + // } ///up to now, given a in swap ///x->ts and x->te are the coordinates in unitig (x->score>>1), instead of HiFi read (x->v>>1) ///x->qs and x->qe are the coordinates in UL @@ -16234,17 +16263,37 @@ uint64_t rov2uov_ctg(uint64_t rid, const ul_idx_t *uref, utg_rid_dt *ru_map, uc_ uint64_t get_unique_rctg_aln(const ul_idx_t *uref, uint64_t v, uint64_t vi, uint64_t vs, uint64_t ulen, ul_ov_t *ures, kv_ul_ov_t *rr, uint32_t sec) { - utg_rid_dt *a = NULL; uint64_t a_n, k, rrn = 0; uc_block_t z; ul_ov_t p; + utg_rid_dt *a = NULL; uint64_t a_n, k, rrn = 0, over_bd = 0; uc_block_t z; ul_ov_t p; p.tn = p.qn = ((uint32_t)-1); if(ures) ures->tn = ures->qn = ((uint32_t)-1); if(IS_SCAF_READ(R_INF, (v>>1))) return 0;///actually no scaf nodes within the assembly graph a = get_r_ug_region(uref->r_ug, &a_n, v>>1); if(!a) return 0; - - - z.ts = 0; z.te = Get_READ_LENGTH(R_INF, (v>>1)); + z.qs = vs; z.qe = vs + Get_READ_LENGTH(R_INF, (v>>1)); - if(z.qs > ulen) z.qs = ulen; if(z.qe > ulen) z.qe = ulen; + if(z.qs > ulen) { + z.qs = ulen; + over_bd = 1; + } + + if(z.qe > ulen) { + z.qe = ulen; + over_bd = 1; + } + z.hid = v>>1; z.rev = v&1; z.el = 1; + z.ts = 0; z.te = Get_READ_LENGTH(R_INF, (v>>1)); + if(over_bd) {//////need to handle it especially for circles + over_bd = z.qe - z.qs; + if(over_bd > Get_READ_LENGTH(R_INF, (v>>1))) { + over_bd = Get_READ_LENGTH(R_INF, (v>>1)); + } + if(!(z.rev)) { + z.ts = 0; z.te = over_bd; + } else { + z.ts = Get_READ_LENGTH(R_INF, (v>>1))-over_bd; z.te = Get_READ_LENGTH(R_INF, (v>>1)); + } + } + if(z.qe <= z.qs || z.te <= z.ts) return 0; for (k = rrn = 0; k < a_n; k++) { @@ -16266,7 +16315,7 @@ uint64_t cmp_shrink_ul_ov_t(ul_ov_t *a0, ul_ov_t *a1, uint64_t off) return 1; } -void ctg_rg2ug_gen(ma_utg_t *r_cl, kv_ul_ov_t *u_cl, const ul_idx_t *uref) +void ctg_rg2ug_gen(ma_utg_t *r_cl, kv_ul_ov_t *u_cl, const ul_idx_t *uref, uint64_t id) { if(r_cl->n <= 0) return ; uint64_t k, i, m, l, is_push, vs, ls; ul_ov_t kp, lp, kp0; @@ -16294,11 +16343,20 @@ void ctg_rg2ug_gen(ma_utg_t *r_cl, kv_ul_ov_t *u_cl, const ul_idx_t *uref) if(kp.qs < lp.qs) lp.qs = kp.qs; if(kp.qe > lp.qe) lp.qe = kp.qe; if(kp.ts < lp.ts) lp.ts = kp.ts; if(kp.te > lp.te) lp.te = kp.te; ls += (uint32_t)(r_cl->a[i]); + + // if(id == 123) { + // fprintf(stderr, "-0-[M::%s::]\tutg%.6dl(%c)\tq::[%d, %d)\tt::[%d, %d)\tsc::%d\n", __func__, + // (int32_t)(kp.tn>>1)+1, "+-"[kp.tn&1], kp.qs, kp.qe, kp.qe-kp.qs, kp.ts, kp.te, kp.te-kp.ts, kp.sec); + // } } lp.sec = k - l; kv_push(ul_ov_t, *u_cl, lp); } else if(k - l == 1) { m = get_unique_rctg_aln(uref, r_cl->a[l]>>32, l, ls, r_cl->len, NULL, u_cl, 1); + // if(id == 123) { + // fprintf(stderr, "-1-[M::%s::]\tutg%.6dl(%c)\tq::[%d, %d)\tt::[%d, %d)\tsc::%d\n", __func__, + // (int32_t)(kp.tn>>1)+1, "+-"[kp.tn&1], kp.qs, kp.qe, kp.qe-kp.qs, kp.ts, kp.te, kp.te-kp.ts, kp.sec); + // } } l = k; lp = kp0; ls = vs; @@ -16412,7 +16470,10 @@ int64_t chain_offset) t[i] = 1; i = p[i]; } adjust_rev_tse(&(ch[n_u]), ug->g->seq[ch[n_u].tn].len, &its, &ite); - ch[n_u].ts = its; ch[n_u].te = ite; ch[n_u].sec = (sc>0x3FFFFFFF?0x3FFFFFFF:sc); + ch[n_u].ts = its; ch[n_u].te = ite; + + sc = ((i < 0)?(sc):(sc-f[i])); + ch[n_u].sec = (sc>0x3FFFFFFF?0x3FFFFFFF:sc);///this is calculated incorrectly // ch[n_u].qn += chain_offset; //start idx of read alignment in chain // ch[n_u].tn = k + chain_offset; //end idx of read alignment in chain ch[n_u].qn = k + chain_offset; //end idx of read alignment in chain @@ -16467,7 +16528,7 @@ int64_t bw, double diff_ec_ul, int64_t max_skip, int64_t ulid, Chain_Data* dp, c kv_ul_ov_t *idx = &(ll->lo), *init = &(ll->tk); int64_t max_idx; idx->n = init->n = 0; // gl_rg2ug_gen(rch, idx, uref, 1, 2, ulid); - ctg_rg2ug_gen(rch, idx, uref); + ctg_rg2ug_gen(rch, idx, uref, ulid); // uint32_t k; // fprintf(stderr, "[M::%s] rch->len::%u, rch->n::%u, idx->n::%u\n", __func__, (uint32_t)rch->len, (uint32_t)rch->n, (uint32_t)idx->n); @@ -16493,13 +16554,13 @@ int64_t bw, double diff_ec_ul, int64_t max_skip, int64_t ulid, Chain_Data* dp, c kv_resize(uint64_t, ll->srt.a, gdp->l.n); max_idx = ctg_chain_graph(b->km, uref, uref->ug, &(gdp->l), &(gdp->swap), &(gdp->dst), &(gdp->out), - &(gdp->path), rch->len, uopt, G_CHAIN_BW, N_GCHAIN_RATE, ll->srt.a.a, sps, dp, UG_SKIP_GRAPH_N, UG_ITER_N, /**UG_DIS_N**/UG_DIS_N*100); + &(gdp->path), rch->len, uopt, G_CHAIN_BW, N_GCHAIN_RATE, ll->srt.a.a, sps, dp, UG_SKIP_GRAPH_N, UG_ITER_N, /**UG_DIS_N**/UG_DIS_N*100, ulid); //sps -> idx; (gdp->l) -> alignments if(max_idx >= 0) { max_idx = select_max_ctg_chain(b->km, uref, ulid, sps, &(gdp->l), &(ll->tk), uref->ug->g, &(gdp->dst_done), &(gdp->out), &(gdp->path), &(gdp->swap)); if(max_idx) { - push_ctg_res(res, &(gdp->swap)); + push_ctg_res(res, &(gdp->swap), ulid); return 1; } } @@ -16557,7 +16618,7 @@ uint64_t pick_bubble(uc_block_t *a, uint64_t a_n, bubble_type *bub, uint64_t *r_ return 0; } -uint64_t qry_bub_sc(const ul_idx_t *uref, ma_ug_t *gfa, scaf_res_t *ref_sc, bubble_type *bub, uint64_t bid, uint64_t qsidx, uint64_t qeidx, ul_vec_t *qstr, kv_ul_ov_t *res) +uint64_t qry_bub_sc(const ul_idx_t *uref, ma_ug_t *gfa, scaf_res_t *ref_sc, bubble_type *bub, uint64_t bid, uint64_t qsidx, uint64_t qeidx, ul_vec_t *qstr, uint32_t avoid_id, kv_ul_ov_t *res) { // fprintf(stderr, "qsidx::%lu(utg%.6u%c), qeidx::%lu(utg%.6u%c)\n", qsidx, (qstr->bb.a[qsidx].hid)+1, "lc"[gfa->u.a[qstr->bb.a[qsidx].hid].circ], qeidx, (qstr->bb.a[qeidx].hid)+1, "lc"[gfa->u.a[qstr->bb.a[qeidx].hid].circ]); uint32_t beg, sink, v, w, z, si, ei, qrev = (uint32_t)-1, qv, qw, trev, nn = 0, tn; @@ -16577,6 +16638,7 @@ uint64_t qry_bub_sc(const ul_idx_t *uref, ma_ug_t *gfa, scaf_res_t *ref_sc, bubb for (i = 0; i < a_n; i++) { // fprintf(stderr, "+(0)i::%lu\n", i); if((a[i].u&1) != (v&1)) continue; + if((a[i].u>>1) == avoid_id) continue; q = &(ref_sc->a[a[i].u>>1]); si = ei = a[i].off; assert((q->bb.a[si].hid == (v>>1)) && (q->bb.a[si].rev == (v&1))); // fprintf(stderr, "+(1)i::%lu\n", i); @@ -16603,6 +16665,7 @@ uint64_t qry_bub_sc(const ul_idx_t *uref, ma_ug_t *gfa, scaf_res_t *ref_sc, bubb // fprintf(stderr, "-a_n::%lu\n", a_n); for (i = 0; i < a_n; i++) { if((a[i].u&1) != (v&1)) continue; + if((a[i].u>>1) == avoid_id) continue; q = &(ref_sc->a[a[i].u>>1]); si = ei = a[i].off; assert((q->bb.a[si].hid == (v>>1)) && (q->bb.a[si].rev == (v&1))); for (k = tn = 0; k < q->bb.n; k++) tn += q->bb.a[k].te - q->bb.a[k].ts; @@ -16686,11 +16749,12 @@ uint64_t cc_bub_match(ul_ov_t *ch, int64_t ch_n) { return k; } -uint64_t extract_scaf_res_t(uc_block_t *in, const ul_idx_t *uref, scaf_res_t *ref_sc, kv_ul_ov_t *res) +uint64_t extract_scaf_res_t(uc_block_t *in, const ul_idx_t *uref, scaf_res_t *ref_sc, uint32_t avid, kv_ul_ov_t *res) { utg_rid_dt *a; uint64_t i, a_n, nn, nw; uc_block_t *ou; ul_ov_t p; memset(&p, 0, sizeof(p)); a = get_r_ug_region(uref->r_ug, &a_n, in->hid); for (i = nn = 0; i < a_n; i++) { + if((a[i].u>>1) == avid) continue; ou = &(ref_sc->a[a[i].u>>1].bb.a[a[i].off]); p.qn = (uint32_t)-1; p.qs = in->qs; p.qe = in->qe; p.tn = a[i].u>>1; p.ts = ou->qs; p.te = ou->qe; @@ -16738,7 +16802,7 @@ uint64_t gen_trans_ovlp_scaf(ma_ug_t *gfa, u_trans_t *map, uc_block_t *qin, uc_b return 1; } -void extract_trans_scaf_res_t(uc_block_t *qin, const ul_idx_t *uref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta, kv_ul_ov_t *res) +void extract_trans_scaf_res_t(uc_block_t *qin, const ul_idx_t *uref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint32_t avid, kv_ul_ov_t *res) { utg_rid_dt *a; uint64_t i, a_n; uc_block_t *rin; u_trans_t *u; uint64_t k, u_n; @@ -16751,6 +16815,7 @@ void extract_trans_scaf_res_t(uc_block_t *qin, const ul_idx_t *uref, scaf_res_t // fprintf(stderr, "+[M::%s] utg%.6u%c->utg%.6u%c, q::[%u, %u), %c, t::[%u, %u)\n", __func__, qin->hid+1, "lc"[gfa->u.a[qin->hid].circ], u[k].tn+1, "lc"[gfa->u.a[u[k].tn].circ], u[k].qs, u[k].qe, "+-"[u[k].rev], u[k].ts, u[k].te); a = get_r_ug_region(uref->r_ug, &a_n, u[k].tn); for (i = 0; i < a_n; i++) { + if((a[i].u>>1) == avid) continue; rin = &(ref_sc->a[a[i].u>>1].bb.a[a[i].off]); if(gen_trans_ovlp_scaf(gfa, &(u[k]), qin, rin, &p)) { p.tn = a[i].u>>1; p.qn = u[k].f; p.el = 0; p.sec = u[k].nw; @@ -16762,9 +16827,9 @@ void extract_trans_scaf_res_t(uc_block_t *qin, const ul_idx_t *uref, scaf_res_t } } -void cl_trans_gen(uint32_t qid, uint32_t qlen, ul_vec_t *qstr, kv_ul_ov_t *res, const ul_idx_t *uref, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, bubble_type *bub, kv_u_trans_t *ta) +void cl_trans_gen(uint32_t qid, uint32_t qlen, ul_vec_t *qstr, kv_ul_ov_t *res, const ul_idx_t *uref, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, bubble_type *bub, kv_u_trans_t *ta, uint32_t is_self) { - uint64_t k, l, m = 0, mi, z, zn, a_n, bid, nw; uc_block_t *a; + uint64_t k, l, m = 0, mi, z, zn, a_n, bid, nw; uc_block_t *a; uint32_t avid = ((is_self)?(qid):((uint32_t)-1)); res->n = 0; ///identify bubble chain first for (k = 0, a = qstr->bb.a; k < qstr->bb.n; k++) { @@ -16772,7 +16837,7 @@ void cl_trans_gen(uint32_t qid, uint32_t qlen, ul_vec_t *qstr, kv_ul_ov_t *res, while (z < a_n) { zn = 1; zn = pick_bubble(a + z, a_n - z, bub, &bid); - if((!zn) || (!qry_bub_sc(uref, gfa, ref_sc, bub, bid, z, z + zn, qstr, res))) {///found a bubble; a[z] -> beg; a[z + zn] -> sink + if((!zn) || (!qry_bub_sc(uref, gfa, ref_sc, bub, bid, z, z + zn, qstr, avid, res))) {///found a bubble; a[z] -> beg; a[z + zn] -> sink zn = 1; } z += zn; @@ -16814,10 +16879,10 @@ void cl_trans_gen(uint32_t qid, uint32_t qlen, ul_vec_t *qstr, kv_ul_ov_t *res, if((mi < m) && (z >= res->a[mi].qs) && (z <= res->a[mi].qe)) continue; ///exact match - if(extract_scaf_res_t(&(qstr->bb.a[z]), uref, ref_sc, res)) continue; + if(extract_scaf_res_t(&(qstr->bb.a[z]), uref, ref_sc, avid, res)) continue; ///trans match - extract_trans_scaf_res_t(&(qstr->bb.a[z]), uref, ref_sc, gfa, ta, res); + extract_trans_scaf_res_t(&(qstr->bb.a[z]), uref, ref_sc, gfa, ta, avid, res); } } @@ -16839,33 +16904,468 @@ void cl_trans_gen(uint32_t qid, uint32_t qlen, ul_vec_t *qstr, kv_ul_ov_t *res, // } } -void ctg_trans_gp_chain(kv_ul_ov_t *res, kv_ul_ov_t *buf, const ul_idx_t *uref, const ug_opt_t *uopt, int64_t bw, -double diff_ec_ul, int64_t qlen, Chain_Data* dp) +inline int64_t comput_linear_ctg_trans_sc(ul_ov_t *li, ul_ov_t *lj, double diff_ec_ul, double ovlp_max, int64_t bw) +{ ///li is the suffix of lj + int64_t dq, dt, dd, mm, os, oe; + if(lj->rev != li->rev) return INT32_MIN; + int64_t iqs, iqe, its, ite, jqs, jqe, jts, jte; + iqs = li->qs; iqe = li->qe; + jqs = lj->qs; jqe = lj->qe; + its = li->ts; ite = li->te; + jts = lj->ts; jte = lj->te; + // if(li->rev) { + // its = lj->ts; ite = lj->te; + // jts = li->ts; jte = li->te; + // } + if(jte > ite) return INT32_MIN; + + + + + dq = iqs - jqe; + dt = its - jte; + dd = (dq>dt? dq-dt:dt-dq); + + dq = iqe - jqs; + dt = ite - jts; + mm = MAX(dq, dt); mm *= diff_ec_ul; if(mm < bw) mm = bw; + if(dd > mm) return INT32_MIN; + + ///too long overlap + dd = iqe - iqs; if(dd > jqe - jqs) dd = jqe - jqs; + dd *= ovlp_max; if(dd < bw) dd = bw; + os = MAX(iqs, jqs), oe = MIN(iqe, jqe); + mm = ((oe > os)? (oe - os):0); + if(mm > 0 && mm > dd) return INT32_MIN; + + dd = ite - its; if(dd > jte - jts) dd = jte - jts; + dd *= ovlp_max; if(dd < bw) dd = bw; + os = MAX(its, jts), oe = MIN(ite, jte); + mm = ((oe > os)? (oe - os):0); + if(mm > dd) return INT32_MIN; + + return li->sec; +} + +uint64_t linear_ctg_trans_chain_dp(uint32_t qid, ul_ov_t *ch, int64_t ch_n, const ul_idx_t *uref, const ug_opt_t *uopt, int64_t bw, +double diff_ec_ul, double ovlp_max, double unmatch_max, int64_t qlen, int64_t tlen, int64_t max_skip, int64_t max_iter, int64_t max_dis, Chain_Data* dp, ul_ov_t *res, asg64_v *ea) { - uint64_t k/**, l, z, an, m**/; - for (k = 0; k < res->n; k++) { + memset(res, 0, sizeof((*res))); + if(ch_n == 0) return 0; + int64_t i, j, k, l, sc, csc, se, cse, mm_sc, mm_se, mm_idx, max, max_e, peak_sc, peak_e, peak_i, rts, rte, rqs, rqe; + ul_ov_t *li = NULL, *lj = NULL; int64_t *f, *e, st, plus, max_ii, n_skip, end_j; int32_t *p, *t; + resize_Chain_Data(dp, ch_n, NULL); + e = dp->tmp; f = dp->pre; p = dp->score; t = dp->indels; + + i = 0; + if(i < ch_n) { + ch[i].tn >>= 1; + if(ch[i].rev) { + adjust_rev_tse(&(ch[i]), tlen, &rts, &rte); + ch[i].ts = rts; ch[i].te = rte; + } + } + + for (i = 1, j = 0; i <= ch_n; i++) { + if(i < ch_n) { + ch[i].tn >>= 1; + if(ch[i].rev) { + adjust_rev_tse(&(ch[i]), tlen, &rts, &rte); + ch[i].ts = rts; ch[i].te = rte; + } + } + if (i == ch_n || ch[i].rev != ch[j].rev) { + if(i - j > 1) { + radix_sort_ul_ov_srt_qe(ch+j, ch+i); + for (k = j + 1, l = j; k <= i; k++) { + if(k == i || ch[k].qe != ch[l].qe) { + if(k - l > 1) radix_sort_ul_ov_srt_qs(ch+l, ch+k); + l = k; + } + } + } + j = i; + } + } + + // fprintf(stderr, "[M::%s::] ch_n:%ld\n", __func__, ch_n); + memset(t, 0, (ch_n*sizeof((*t)))); peak_sc = peak_e = peak_i = INT32_MIN; + for (i = st = plus = 0, max_ii = -1; i < ch_n; ++i) { + li = &(ch[i]); csc = li->sec; cse = 0; + if(li->el || li->qn == RC_0 || li->qn == RC_1) cse = li->qe - li->qs; + + mm_sc = csc; mm_se = cse; mm_idx = -1; n_skip = 0; end_j = -1; + st = (i= st; --j) { + lj = &(ch[j]); + sc = comput_linear_ctg_trans_sc(li, lj, diff_ec_ul, ovlp_max, bw); ///should allow contain + if(sc == INT32_MIN) continue; + sc += f[j]; se = cse + e[j]; + if((sc > mm_sc) || (sc == mm_sc && se > mm_se)) { + mm_sc = sc, mm_se = se, mm_idx = j; + if (n_skip > 0) --n_skip; + } else if (t[j] == i) { + if (++n_skip > max_skip) + break; + } + if (p[j] >= 0) t[p[j]] = i; + } + + + end_j = j; + if (max_ii < 0 || (ch[i].qe>(ch[max_ii].qe+max_dis))) {//too long + max = INT32_MIN; max_e = INT32_MIN; max_ii = -1; + for (j = i - 1; (j >= st) && (ch[i].qe<=(max_dis+ch[j].qe)); --j) { + if ((max < f[j]) || (max == f[j] && max_e < e[j])) { + max = f[j], max_e = e[j], max_ii = j; + } + } + } + + if (max_ii >= 0 && max_ii < end_j) {///just have a try with a[i]<->a[max_ii] + lj = &(ch[max_ii]); + sc = comput_linear_ctg_trans_sc(li, lj, diff_ec_ul, ovlp_max, bw); ///should allow contain + if(sc != INT32_MIN) { + sc += f[max_ii]; + se = cse + e[max_ii]; + if((sc > mm_sc) || (sc == mm_sc && se > mm_se)) { + mm_sc = sc; mm_se = se; mm_idx = max_ii; + } + } + } + + f[i] = mm_sc; e[i] = mm_se; p[i] = mm_idx; + if ((max_ii < 0) || ((ch[i].qe<=max_dis+ch[max_ii].qe) && ((f[max_ii] peak_sc) || (f[i] == peak_sc || e[i] > peak_e)) { + peak_sc = f[i]; peak_e = e[i]; peak_i = i; + } + // li->sec = (mm_idx<0?0x3FFFFFFF:i-mm_idx); + // sv[i] = *li; + // fprintf(stderr, "##(%ld) %u\t%u\t%c\tutg%.6d%c(%u)\t%u\t%u\tmm_idx:%ld\tmm_sc:%ld\n", i, li->qs, li->qe, "+-"[li->rev], + // (int32_t)(li->tn)+1, "lc"[uref->ug->u.a[li->tn].circ], uref->ug->u.a[li->tn].len, li->ts, li->te, mm_idx, mm_sc); + // track[i] = push_sc_pre(mm_sc, mm_idx); + // li->sec = (mm_idx<0?0x3FFFFFFF:i-mm_idx); sv[i] = *li; + } + + if(peak_i < 0) return 0; + + uint64_t qs, qgap = 0, ts, tgap = 0, eqs = (uint64_t)-1, eqe = (uint64_t)-1, ea_n = ea->n; + i = peak_i; *res = ch[i]; + res->qn = ((peak_sc>UINT32_MAX)?(UINT32_MAX):(peak_sc)); + res->sec = 0; qs = res->qs; ts = res->ts; + + // memset(er, 0, sizeof((*er))); er->qs = er->qe = er->ts = er->te = (uint32_t)-1; + // er->qn = ((peak_e>UINT32_MAX)?(UINT32_MAX):(peak_e)); + + ///has been sorted by qe && te + while (i >= 0) { + if(ch[i].qe >= qs) { + if(ch[i].qs < qs) qs = ch[i].qs; + } else { + qgap += qs - ch[i].qe; + qs = ch[i].qs; + } + + if(ch[i].te >= ts) { + if(ch[i].ts < ts) ts = ch[i].ts; + } else { + tgap += ts - ch[i].te; + ts = ch[i].ts; + } + + if(ch[i].el || ch[i].qn == RC_0 || ch[i].qn == RC_1) {///reliable + if ((ea->n > ea_n) && (ch[i].qe >= (ea->a[ea->n-1]>>32))) { + if(ch[i].qs < (ea->a[ea->n-1]>>32)) { + eqs = ch[i].qs; eqe = (uint32_t)(ea->a[ea->n-1]); + ea->a[ea->n-1] = eqs << 32; ea->a[ea->n-1] |= eqe; + } + } else { + eqs = ch[i].qs; eqe = ch[i].qe; eqs <<= 32; eqs |= eqe; + kv_push(uint64_t, *ea, eqs); + // ea->a[ea->n-1] = eqs << 32; ea->a[ea->n-1] |= eqe; + } + } + + if(ch[i].qs < res->qs) res->qs = ch[i].qs; + if(ch[i].qe > res->qe) res->qe = ch[i].qe; + if(ch[i].ts < res->ts) res->ts = ch[i].ts; + if(ch[i].te > res->te) res->te = ch[i].te; + // if(qid == 2 && qlen == 112183461 && ch[i].tn == 30 && tlen == 24761445) { + // fprintf(stderr, "-(%ld)[M::%s] h2tg%.6u%c\t%ld\t%u\t%u\t%c\th1tg%.6u%c\t%ld\t%u\t%u\tsec(%u)\tel(%u)\tf(%u)\n", i, __func__, + // qid+1, "lc"[0], qlen, ch[i].qs, ch[i].qe, "+-"[ch[i].rev], ch[i].tn+1, "lc"[0], tlen, ch[i].ts, ch[i].te, ch[i].sec, ch[i].el, ch[i].qn); + // } else if(ch[i].tn == 2 && tlen == 112183461 && ch[i].qn == 30 && qlen == 24761445) { + // fprintf(stderr, "-(%ld)[M::%s] h1tg%.6u%c\t%ld\t%u\t%u\t%c\th2tg%.6u%c\t%ld\t%u\t%u\tsec(%u)\tel(%u)\tf(%u)\n", i, __func__, + // qid+1, "lc"[0], qlen, ch[i].qs, ch[i].qe, "+-"[ch[i].rev], ch[i].tn+1, "lc"[0], tlen, ch[i].ts, ch[i].te, ch[i].sec, ch[i].el, ch[i].qn); + // } + i = p[i]; + } + ////0x3fffffff + // if(qgap > 0x3fffffff || tgap > 0x3fffffff) { + // ea->n = ea_n; return 0; + // } + + adjust_rev_tse(res, tlen, &rts, &rte); + res->ts = rts; res->te = rte; + extend_end_coord(NULL, res, qlen, tlen, &rqs, &rqe, &rts, &rte); + qgap += (rqe - rqs) - (res->qe - res->qs); + tgap += (rte - rts) - (res->te - res->ts); + + // if(qid == 2 && qlen == 112183461 && ch[0].tn == 30 && tlen == 24761445) { + // fprintf(stderr, "+(%ld)[M::%s] h2tg%.6u%c\t%ld\t%ld(%u)\t%ld(%u)\t%c\th1tg%.6u%c\t%ld\t%ld(%u)\t%ld(%u)\tqgap(%ld)\ttgap(%ld)\n", i, __func__, + // qid+1, "lc"[0], qlen, rqs, res->qs, rqe, res->qe, "+-"[ch[0].rev], ch[0].tn+1, "lc"[0], tlen, rts, res->ts, rte, res->te, qgap, tgap); + // } else if(ch[i].tn == 2 && tlen == 112183461 && ch[0].qn == 30 && qlen == 24761445) { + // fprintf(stderr, "+(%ld)[M::%s] h2tg%.6u%c\t%ld\t%ld(%u)\t%ld(%u)\t%c\th1tg%.6u%c\t%ld\t%ld(%u)\t%ld(%u)\tqgap(%ld)\ttgap(%ld)\n", i, __func__, + // qid+1, "lc"[0], qlen, rqs, res->qs, rqe, res->qe, "+-"[ch[0].rev], ch[0].tn+1, "lc"[0], tlen, rts, res->ts, rte, res->te, qgap, tgap); + // } + + // if(qgap > 0x3fffffff || tgap > 0x3fffffff) { + // ea->n = ea_n; return 0; + // } + if((qgap > ((rqe-rqs)*unmatch_max)) || (tgap > ((rte-rts)*unmatch_max))) { + ea->n = ea_n; return 0; + } + + double qrate = ((double)((rqe-rqs)-qgap))/((double)(rqe-rqs)); + double trate = ((double)((rte-rts)-tgap))/((double)(rte-rts)); + double mrate = MIN(qrate, trate); if(mrate < 0) mrate = 0; + res->qn *= mrate; + if(res->qn <= 0) { + ea->n = ea_n; return 0; + } + + res->qs = rqs; res->qe = rqe; res->ts = rts; res->te = rte; + res->sec = (0x3fffffff); + if(ea->n > ea_n) { + uint64_t *a = ea->a + ea_n, an = ea->n - ea_n, ai, an1 = an>>1; + for(ai = 0; ai < an1; ai++) { + qs = a[ai]; a[ai] = a[an - ai - 1]; a[an - ai - 1] = qs; + } + res->sec = ea->n; kv_push(uint64_t, *ea, an); + } + + return 1; +} + +void ctg_trans_gp_chain(uint32_t qid, kv_ul_ov_t *res, const ul_idx_t *uref, const ug_opt_t *uopt, int64_t bw, double diff_ec_ul, int64_t qlen, Chain_Data* dp, ma_ug_t *ref, uint32_t is_self, asg64_v *b) +{ + uint64_t k, l, m = 0, rn = res->n; ul_ov_t rr; uint32_t avid = ((is_self)?(qid):((uint32_t)-1)); + for (k = 0; k < rn; k++) { res->a[k].tn <<= 1; res->a[k].tn |= res->a[k].rev; } - radix_sort_ul_ov_srt_tn(res->a, res->a + res->n); - - + radix_sort_ul_ov_srt_tn(res->a, res->a + rn); + + for (k = 1, l = 0, b->n = 0; k <= rn; k++) { + if((k == rn) || ((res->a[l].tn>>1) != (res->a[k].tn>>1))) { + // fprintf(stderr, "+[M::%s]\tutg%.6ul\n", __func__, (res->a[l].tn>>1) + 1); + if(((res->a[l].tn>>1) != avid) && (linear_ctg_trans_chain_dp(qid, res->a+l, k-l, uref, uopt, bw, diff_ec_ul, /**0.333333**/0.4, /**0.333333**/0.666666, qlen, ref->u.a[(res->a[l].tn>>1)].len, UG_SKIP_N, UG_ITER_N, UG_DIS_N, dp, &rr, b))) { + res->a[m++] = rr; + // fprintf(stderr, "-[M::%s]\tutg%.6ul\n", __func__, rr.tn + 1); + } + l = k; + } + } + res->n = m; + + // for (k = 0; k < res->n; k++) { + // fprintf(stderr, "[M::%s]\tutg%.6ul(len::%ld)\tq::[%u, %u)\t%c\tutg%.6ul(len::%u)\tt::[%u, %u)\tel::%u\ttp::%u\n", __func__, qid + 1, qlen, res->a[k].qs, res->a[k].qe, "+-"[res->a[k].rev], res->a[k].tn+1, ref->u.a[res->a[k].tn].len, res->a[k].ts, res->a[k].te, res->a[k].el, res->a[k].qn); + // } +} + + +uint64_t cal_reliable_ovlp(ul_ov_t *z, asg64_v *in, uint64_t s, uint64_t e) +{ + if(z->sec >= in->n) return 0; + uint64_t k, zs, ze, os, oe, ovq, l = 0; + for (k = z->sec - in->a[z->sec]; k < z->sec; k++) { + zs = (in->a[k]>>32); ze = ((uint32_t)in->a[k]); + if(zs >= e) break; + if(ze <= s) continue; + os = MAX(s, zs); oe = MIN(e, ze); + ovq = ((oe > os)? (oe - os):0); + l += ovq; + } + return l; +} + + +uint64_t filter_ctg_trans(uint32_t qid, ma_ug_t *qry, ma_ug_t *ref, kv_ul_ov_t *aln, asg64_v *aln_e, asg64_v *srt, double reliable_rate, uint64_t reliable_len, double sec_rate, uint64_t sec_bd, double sec_sc, uint64_t soff, uint32_t is_self) +{ + if(aln->n <= 0) return 0; + + uint64_t k, an, *ss, *hs, *bu, *wu, clen, m, wm, z, l, wl, le, os, oe, ovq; ul_ov_t *cz, *mz; uint32_t sce[2]; sce[0] = 0; sce[1] = 1; + for (k = 0; k < aln->n; k++) { + aln->a[k].el = 0; clen = 0; + if((!is_self) || (sce[(qida[k].tna[k].sec < aln_e->n) {///has reliable regions + for (z = aln->a[k].sec - aln_e->a[aln->a[k].sec]; z < aln->a[k].sec; z++) { + clen += ((uint32_t)aln_e->a[z]) - (aln_e->a[z]>>32); + } + } + if((clen > reliable_len) && (clen > ((aln->a[k].qe - aln->a[k].qs)*reliable_rate))) { + aln->a[k].el = 1; + } + } + aln->a[k].qn = ((uint32_t)-1) - aln->a[k].qn; + + // if(qid == 2 && qry->u.a[qid].len == 112183461) {///h2tg000003l + // fprintf(stderr, "+[M::%s] h2tg%.6u%c\t%u\t%u\t%u\t%c\th1tg%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } else if(qid == 30 && qry->u.a[qid].len == 24761445) {///h1tg000031l + // fprintf(stderr, "+[M::%s] h1tg%.6u%c\t%u\t%u\t%u\t%c\th12g%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } else if(aln->a[k].tn == 2 && ref->u.a[aln->a[k].tn].len == 112183461) {///h2tg000003l + // fprintf(stderr, "+[M::%s] h1tg%.6u%c\t%u\t%u\t%u\t%c\th12g%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } else if(aln->a[k].tn == 30 && ref->u.a[aln->a[k].tn].len == 24761445) { + // fprintf(stderr, "+[M::%s] h2tg%.6u%c\t%u\t%u\t%u\t%c\th1tg%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } + } + if(aln->n > 1) radix_sort_ul_ov_srt_qn(aln->a, aln->a + aln->n);///qn->score + for (k = 0; k < aln->n; k++) aln->a[k].qn = ((uint32_t)-1) - aln->a[k].qn; + + an = aln->n; an <<= 2; + srt->n = 0; kv_resize(uint64_t, *srt, an); + for (k = 0; k < aln->n; k++) { + srt->a[srt->n] = aln->a[k].qs; + srt->a[srt->n] <<= 32; + srt->a[srt->n] |= k; + srt->n++; + } + + ss = srt->a; hs = ss + srt->n; bu = hs + srt->n; wu = bu + srt->n; + radix_sort_gfa64(ss, ss + aln->n); + for (k = 0; k < aln->n; k++) { + ss[k] = (uint32_t)ss[k]; hs[ss[k]] = k; + } + for (k = 0; k < aln->n; k++) { + if(aln->a[k].el) continue; + cz = &(aln->a[k]); clen = (cz->qe-cz->qs) * sec_rate; m = wm = 0; + + for (z = l = wl = le = 0; z < aln->n; z++) { + if(z == hs[k]) { + assert(ss[z] == k); + continue; + } + if(ss[z] > k) continue;///smaller nw than a[k] + mz = &(aln->a[ss[z]]); + if(mz->qs >= cz->qe) break; + if(mz->tn == ((uint32_t)-1)) continue;///has been deleted + if((!(mz->el)) && (cz->qn > (mz->qn*0.9))) continue; + os = MAX(cz->qs, mz->qs); oe = MIN(cz->qe, mz->qe); + ovq = ((oe > os)? (oe - os):0); + if(!ovq) continue; + // if(is_connect_arc(cz, mz, s->ug, 0.04) || is_connect_arc(mz, cz, s->ug, 0.04)) continue; + + if((m > 0) && (((uint32_t)bu[m-1]) >= os)) { + if(oe > ((uint32_t)bu[m-1])) { + le += cal_reliable_ovlp(cz, aln_e, ((uint32_t)bu[m-1]), oe); + l += (oe - ((uint32_t)bu[m-1])); + bu[m-1] += (oe - ((uint32_t)bu[m-1])); + } + } else { + le += cal_reliable_ovlp(cz, aln_e, os, oe); + l += oe - os; + bu[m] = os; bu[m] <<= 32; bu[m] |= oe; m++; + } + + if((wm > 0) && (((uint32_t)wu[wm-1]) >= mz->qs)) { + if(mz->qe > ((uint32_t)wu[wm-1])) { + wl += (mz->qe - ((uint32_t)wu[wm-1])); + wu[wm-1] += (mz->qe - ((uint32_t)wu[wm-1])); + } + } else { + wl += mz->qe - mz->qs; + wu[wm] = mz->qs; wu[wm] <<= 32; wu[wm] |= mz->qe; wm++; + } + + if((l > 0) && (l > sec_bd) && (le <= l*0.5)) { + if((l>=clen) || (l>=(wl*sec_rate))) { + ///sc is small enough + if(cz->qn <= (mz->qn*sec_sc)) { + cz->tn = ((uint32_t)-1); + break; + } + } + } + } + } + + for (k = m = 0; k < aln->n; k++) { + if(aln->a[k].tn == ((uint32_t)-1)) continue; + // if(qid == 2 && qry->u.a[qid].len == 112183461) {///h2tg000003l + // fprintf(stderr, "-[M::%s] h2tg%.6u%c\t%u\t%u\t%u\t%c\th1tg%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } else if(qid == 30 && qry->u.a[qid].len == 24761445) {///h1tg000031l + // fprintf(stderr, "-[M::%s] h1tg%.6u%c\t%u\t%u\t%u\t%c\th12g%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } else if(aln->a[k].tn == 2 && ref->u.a[aln->a[k].tn].len == 112183461) {///h2tg000003l + // fprintf(stderr, "-[M::%s] h1tg%.6u%c\t%u\t%u\t%u\t%c\th12g%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } else if(aln->a[k].tn == 30 && ref->u.a[aln->a[k].tn].len == 24761445) { + // fprintf(stderr, "-[M::%s] h2tg%.6u%c\t%u\t%u\t%u\t%c\th1tg%.6u%c\t%u\t%u\t%u\tsec(%u)\tel(%u)\n", __func__, + // qid+1, "lc"[qry->u.a[qid].circ], qry->u.a[qid].len, aln->a[k].qs, aln->a[k].qe, "+-"[aln->a[k].rev], + // aln->a[k].tn+1, "lc"[ref->u.a[aln->a[k].tn].circ], ref->u.a[aln->a[k].tn].len, aln->a[k].ts, aln->a[k].te, aln->a[k].sec, aln->a[k].el); + // } + aln->a[m++] = aln->a[k]; + } + aln->n = m; + // radix_sort_ul_ov_srt_qs(aln->a, aln->a + aln->n); + return aln->n; } +void push_ctg_trans_res(uint32_t id, kv_ul_ov_t *in, kv_ul_ov_t *ou) +{ + uint32_t k; + kv_resize(ul_ov_t, *ou, ou->n + in->n); + for (k = 0; k < in->n; k++) { + in->a[k].sec = ((in->a[k].qn>=(0x3fffffff))?(0x3fffffff):(in->a[k].qn)); + in->a[k].qn = id; kv_push(ul_ov_t, *ou, in->a[k]); + } +} uint32_t direct_ctg_trans_chain(mg_tbuf_t *b, uint32_t id, glchain_t *ll, gdpchain_t *gdp, st_mt_t *sps, haplotype_evdience_alloc *hap, const ul_idx_t *uref, const ug_opt_t *uopt, -int64_t bw, double diff_ec_ul, int64_t max_skip, int64_t ulid, Chain_Data* dp, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, bubble_type *bub, kv_u_trans_t *ta, const asg_t *rg) +int64_t bw, double diff_ec_ul, int64_t max_skip, int64_t ulid, Chain_Data* dp, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, bubble_type *bub, kv_u_trans_t *ta, const asg_t *rg, uint64_t soff) { // res->bb.n = 0; // if(ulid != 86660) return 0; kv_ul_ov_t *idx = &(ll->lo), *init = &(ll->tk); ///int64_t max_idx; - idx->n = init->n = 0; - cl_trans_gen(id, qry->u.a[id].len, &(qry_sc->a[id]), idx, uref, ref, ref_sc, gfa, bub, ta); + asg64_v b0, b1; uint32_t is_self = (qry?0:1); idx->n = 0; + + cl_trans_gen(id, qry?qry->u.a[id].len:ref->u.a[id].len, qry_sc?&(qry_sc->a[id]):&(ref_sc->a[id]), idx, uref, ref, ref_sc, gfa, bub, ta, is_self); if(idx->n == 0) return 0; - ctg_trans_gp_chain(idx, init, uref, uopt, bw, diff_ec_ul, qry->u.a[id].len, dp); + copy_asg_arr(b0, (*sps)); + ctg_trans_gp_chain(id, idx, uref, uopt, bw, diff_ec_ul, qry?qry->u.a[id].len:ref->u.a[id].len, dp, ref, is_self, &b0); + copy_asg_arr((*sps), b0); + if(idx->n == 0) return 0; + copy_asg_arr(b0, (*sps)); copy_asg_arr(b1, ll->srt.a); + filter_ctg_trans(id, qry, ref, idx, &b0, &b1, 0.5, 10000, /**0.333333**/0.85, 10000, 0.3, soff, is_self); + copy_asg_arr((*sps), b0); copy_asg_arr(ll->srt.a, b1); if(idx->n == 0) return 0; + push_ctg_trans_res(id, idx, init); /** // gl_rg2ug_gen(rch, idx, uref, 1, 2, ulid); @@ -16971,7 +17471,154 @@ static void worker_for_ctg_trans_alignment(void *data, long i, int tid) s->hab[tid]->num_read_base++; s->hab[tid]->num_correct_base += direct_ctg_trans_chain(s->buf[tid], i, &(s->ll[tid]), &(s->gdp[tid]), &(s->sps[tid]), &(s->hab[tid]->hap), s->uu, s->uopt, G_CHAIN_BW, s->opt->diff_ec_ul, UG_SKIP, i, &(s->hab[tid]->clist.chainDP), - c->qry, c->qry_sc, c->ref, c->ref_sc, c->gfa, c->bub, c->ta, s->rg); + c->qry, c->qry_sc, c->ref, c->ref_sc, c->gfa, c->bub, c->ta, s->rg, c->soff); +} + +void rm_dup_aln(u_trans_t *u, uint64_t u_n, asg64_v *b, double dup_cut) +{ + if(u_n <= 0) return; + uint64_t k, m, s, e, dp, old_dp, z, zs, ze, l, os, oe, ovq, cut; u_trans_t t; + b->n = 0; + for (k = 0; k < u_n; k++) { + s = u[k].qs; e = u[k].qe; u[k].occ = 0; + kv_push(uint64_t, *b, (s<<1)); + kv_push(uint64_t, *b, (e<<1|1)); + } + radix_sort_gfa64(b->a, b->a + b->n); + + for (k = dp = old_dp = m = 0, s = e = (uint64_t)-1; k < b->n; k++) { + old_dp = dp; + //if a[j] is qe + if (b->a[k]&1) --dp; + else ++dp; + + if (old_dp < 2 && dp >= 2) {///old_dp < dp, b.a[k] is qs + s = b->a[k]>>1; + } else if (old_dp >= 2 && dp < 2) {///old_dp > min_dp, b.a[k] is qe + e = b->a[k]>>1; + if(e > s) b->a[m++] = (s<<32)|e;///non-unique region + } + } + + b->n = m; + for (k = m = 0; k < u_n; k++) { + l = 0; s = u[k].qs; e = u[k].qe; + cut = (e - s)*dup_cut; + if(!(u[k].f)) {//unreliable + for (z = 0; z < b->n; z++) { + zs = (b->a[z]>>32); ze = ((uint32_t)b->a[z]); + if(zs >= e) break; + if(ze <= s) continue; + os = MAX(s, zs); oe = MIN(e, ze); + ovq = ((oe > os)? (oe - os):0); + l += ovq; + if(l > cut) break; + } + } + if(((l <= 0) || (l <= cut)) && (l < (e - s))) { + if(m != k) { t = u[m]; u[m] = u[k]; u[k] = t;} + m++; + } + } + + for (k = m; k < u_n; k++) u[k].del = 1;///delete + + for (k = b->n = 0; k < m; k++) { + dp = u[k].nw; dp <<= 32; dp |= k; kv_push(uint64_t, *b, dp); + } + radix_sort_gfa64(b->a, b->a + b->n); old_dp = 0; + for (k = 0; k < b->n; k++) { + assert(!u[((uint32_t)b->a[k])].del); + for (z = 0; z < b->n; z++) { + if(k == z) continue; + if(u[((uint32_t)b->a[z])].del) continue; + if(u[((uint32_t)b->a[k])].qs >= u[((uint32_t)b->a[z])].qs && u[((uint32_t)b->a[k])].qe <= u[((uint32_t)b->a[z])].qe) break; + } + if(z < b->n) { + u[((uint32_t)b->a[k])].del = 1; old_dp++; + } + } + + if(old_dp) { + for (k = m = 0; k < u_n; k++) { + if(u[k].del) continue; + if(m != k) { t = u[m]; u[m] = u[k]; u[k] = t;} + m++; + } + } + + + ///for debug is there is any contained overlaps in u[0, m) + for (k = 0; k < m; k++) { + for (z = 0; z < m; z++) { + if(k == z) continue; + if(u[k].qs >= u[z].qs && u[k].qe <= u[z].qe) break; + } + assert(z >= m); + } +} + +static void worker_for_ctg_trans_order(void *data, long i, int tid) +{ + ctdat_t *c = (ctdat_t*)data; utepdat_t *s = c->s; asg64_v b; + u_trans_t *a = NULL, *r_a = NULL, t; uint64_t n, r_n, k, z, m; + + if(c->mm == 0) { + copy_asg_arr(b, (s->sps[tid])); + rm_dup_aln(u_trans_a(*(c->ta), i), u_trans_n(*(c->ta), i), &b, 0.25); + copy_asg_arr((s->sps[tid]), b); + } else if(c->mm == 1) { + a = u_trans_a(*(c->ta), i); n = u_trans_n(*(c->ta), i); + for (k = 0; k < n; k++) { + if(a[k].tn < a[k].qn) continue; + r_a = u_trans_a(*(c->ta), a[k].tn); r_n = u_trans_n(*(c->ta), a[k].tn); + for(z = 0; (z < r_n) && (r_a[z].tn != a[k].qn); z++); + assert(z < r_n); + // if(a[k].del || r_a[z].del) { + // a[k].del = r_a[z].del = 1; + // } + ///discard too short overlaps + if((!a[k].f) || (!r_a[z].f)) { + if((a[k].qe - a[k].qs <= asm_opt.self_scaf_min) || (a[k].te - a[k].ts <= asm_opt.self_scaf_min)) { + a[k].del = r_a[z].del = 1; + } else if((r_a[z].qe - r_a[z].qs <= asm_opt.self_scaf_min) || (r_a[z].te - r_a[z].ts <= asm_opt.self_scaf_min)) { + a[k].del = r_a[z].del = 1; + } + } + + if(a[k].del != r_a[z].del) { + a[k].del = r_a[z].del = 0; + a[k].occ = r_a[z].occ = (uint32_t)-1; + } + } + } else { + a = u_trans_a(*(c->ta), i); n = u_trans_n(*(c->ta), i); + for (k = m = 0; k < n; k++) { + if((a[k].occ != (uint32_t)-1)) { + if(m != k) {t = a[k]; a[k] = a[m]; a[m] = t;} + m++; + } + } + radix_sort_u_trans_qs0(a, a + m); + radix_sort_u_trans_qs0(a + m, a + n); + a = a + m; n = n - m; + if(n) {//sort and merge unreliable regions + for (k = 0; k < n; k++) a[k].del = 1; + for (k = m = 0; k < n; k++) { + if((m > 0) && (a[m-1].qe >= a[k].qs)) { + if(a[k].qe > a[m-1].qe) a[m-1].qe = a[k].qe; + } else { + a[m++] = a[k]; + } + } + + for(k = m; k < n; k++) {///not useful anymore + a[k].qn = a[k].qs = a[k].qe = (uint32_t)-1; + a[k].tn = a[k].ts = a[k].te = (uint32_t)-1; + } + } + } + } uint32_t inline is_rg_connect(const asg_t *rg, uint32_t v, uint32_t w) @@ -18251,9 +18898,51 @@ scaf_res_t *work_ctg_path_gchains(uldat_t *sl) } -kv_u_trans_t *work_ctg_path_trans(uldat_t *sl, asg_t *sg, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta) +void fill_ctg_path_trans_idx(glchain_t *a, uint64_t n, uint64_t qn, uint32_t qoff, uint32_t toff, kv_u_trans_t *ou) +{ + kv_resize(uint64_t, ou->idx, qn); + memset(ou->idx.a, -1, sizeof((*(ou->idx.a)))*qn); + + uint64_t i, k, l, pn; kv_ul_ov_t *s = NULL; u_trans_t *z = NULL; + for(i = pn = 0; i < n; i++) { + s = &(a[i].tk); pn += s->n; + for (k = 1, l = 0; k <= s->n; k++) { + if(k == s->n || s->a[l].qn != s->a[k].qn) { + assert(ou->idx.a[s->a[l].qn] == (uint64_t)-1); + ou->idx.a[s->a[l].qn] = (i<<32)|l; + l = k; + } + } + } + kv_resize(u_trans_t, *ou, ou->n+(pn<<1)); l = ou->n; + + for (i = 0; i < qn; i++) { + if(ou->idx.a[i] == (uint64_t)-1) { + ou->idx.a[i] = 0; continue; + } + s = &(a[ou->idx.a[i]>>32].tk); + for(k = l = (uint32_t)ou->idx.a[i]; k < s->n && s->a[l].qn == s->a[k].qn; k++) { + kv_pushp(u_trans_t, *ou, &z); + z->qn = s->a[k].qn+qoff; z->qs = s->a[k].qs; z->qe = s->a[k].qe; + z->tn = s->a[k].tn+toff; z->ts = s->a[k].ts; z->te = s->a[k].te; + z->rev = s->a[k].rev; z->f = s->a[k].el; z->nw = s->a[k].sec; + z->occ = s->a[k].sec; z->del = 0; + + + kv_pushp(u_trans_t, *ou, &z); + z->tn = s->a[k].qn+qoff; z->ts = s->a[k].qs; z->te = s->a[k].qe; + z->qn = s->a[k].tn+toff; z->qs = s->a[k].ts; z->qe = s->a[k].te; + z->rev = s->a[k].rev; z->f = s->a[k].el; z->nw = s->a[k].sec; + z->occ = s->a[k].sec; z->del = 0; + + } + // p->idx.a[i] = pn - (k - l); p->idx.a[i] <<= 32; p->idx.a[i] |= k - l; ///pn += k - l; + } + // assert(l + (pn<<1) == ou->n); +} + +void work_ctg_path_trans(uldat_t *sl, asg_t *sg, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint32_t qoff, uint32_t toff, bubble_type *bu, kv_u_trans_t *res) { - kv_u_trans_t *res = NULL; CALLOC(res, 1); utepdat_t s; uint64_t i; memset(&s, 0, sizeof(s)); s.id = 0; s.opt = sl->opt; s.ug = sl->ug; s.uopt = sl->uopt; s.rg = sl->rg; s.uu = sl->uu; CALLOC(s.hab, sl->n_thread); CALLOC(s.buf, sl->n_thread); CALLOC(s.ll, sl->n_thread); @@ -18263,7 +18952,7 @@ kv_u_trans_t *work_ctg_path_trans(uldat_t *sl, asg_t *sg, ma_ug_t *qry, scaf_res s.hab[i] = ha_ovec_init(0, 0, 1); s.buf[i] = mg_tbuf_init(); } - ctdat_t c; memset(&c, 0, sizeof(c)); uint8_t *bf = NULL; c.bub = gen_bubble_chain(sg, gfa, (ug_opt_t *)(sl->uopt), &bf, 0); free(bf); + ctdat_t c; memset(&c, 0, sizeof(c)); c.soff = (uint64_t)-1; /**uint8_t *bf = NULL; c.bub = gen_bubble_chain(sg, gfa, (ug_opt_t *)(sl->uopt), &bf, 0); free(bf);**/c.bub = bu; c.s = &(s); c.qry = qry; c.qry_sc = qry_sc; c.ref = ref; c.ref_sc = ref_sc; c.gfa = gfa; c.ta = ta; // detect_outlier_len("+++work_ul_gchains"); @@ -18271,7 +18960,38 @@ kv_u_trans_t *work_ctg_path_trans(uldat_t *sl, asg_t *sg, ma_ug_t *qry, scaf_res kt_for(sl->n_thread, worker_for_ctg_trans_alignment, &c, c.qry_sc->n); // detect_outlier_len("---work_ul_gchains"); - destory_bubbles(c.bub); free(c.bub); + /**destory_bubbles(c.bub); free(c.bub);**/ + fill_ctg_path_trans_idx(s.ll, sl->n_thread, qry->u.n, qoff, toff, res); + for (i = 0; i < sl->n_thread; ++i) { + s.sum_len += s.hab[i]->num_read_base; s.n += s.hab[i]->num_correct_base; + ha_ovec_destroy(s.hab[i]); mg_tbuf_destroy(s.buf[i]); hc_glchain_destroy(&(s.ll[i])); + hc_gdpchain_destroy(&(s.gdp[i])); kv_destroy(s.mzs[i]); kv_destroy(s.sps[i]); + } + + free(s.hab); free(s.buf); free(s.ll); free(s.gdp); free(s.mzs); free(s.sps); + fprintf(stderr, "[M::%s::] # try:%d, # done:%d\n", __func__, s.sum_len, s.n); +} + +void work_ctg_path_trans_self(uldat_t *sl, asg_t *sg, ma_ug_t *db, scaf_res_t *db_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint64_t soff, bubble_type *bu, kv_u_trans_t *res) +{ + utepdat_t s; uint64_t i; memset(&s, 0, sizeof(s)); + s.id = 0; s.opt = sl->opt; s.ug = sl->ug; s.uopt = sl->uopt; s.rg = sl->rg; s.uu = sl->uu; + CALLOC(s.hab, sl->n_thread); CALLOC(s.buf, sl->n_thread); CALLOC(s.ll, sl->n_thread); + CALLOC(s.gdp, sl->n_thread); CALLOC(s.mzs, sl->n_thread); CALLOC(s.sps, sl->n_thread); + + for (i = 0; i < sl->n_thread; ++i) { + s.hab[i] = ha_ovec_init(0, 0, 1); s.buf[i] = mg_tbuf_init(); + } + + ctdat_t c; memset(&c, 0, sizeof(c)); c.soff = soff; c.bub = bu; + c.s = &(s); c.qry = NULL; c.qry_sc = NULL; c.ref = db; c.ref_sc = db_sc; c.gfa = gfa; c.ta = ta; + + // detect_outlier_len("+++work_ul_gchains"); + + kt_for(sl->n_thread, worker_for_ctg_trans_alignment, &c, db_sc->n); + + // detect_outlier_len("---work_ul_gchains"); + fill_ctg_path_trans_idx(s.ll, sl->n_thread, db->u.n, 0, 0, res); for (i = 0; i < sl->n_thread; ++i) { s.sum_len += s.hab[i]->num_read_base; s.n += s.hab[i]->num_correct_base; ha_ovec_destroy(s.hab[i]); mg_tbuf_destroy(s.buf[i]); hc_glchain_destroy(&(s.ll[i])); @@ -18280,7 +19000,6 @@ kv_u_trans_t *work_ctg_path_trans(uldat_t *sl, asg_t *sg, ma_ug_t *qry, scaf_res free(s.hab); free(s.buf); free(s.ll); free(s.gdp); free(s.mzs); free(s.sps); fprintf(stderr, "[M::%s::] # try:%d, # done:%d\n", __func__, s.sum_len, s.n); - return res; } @@ -21199,7 +21918,7 @@ scaf_res_t *gen_contig_path(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *ctg, ma_ug return res; } -kv_u_trans_t *gen_contig_trans(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta) +void gen_contig_trans(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint32_t qoff, uint32_t toff, bubble_type *bu, kv_u_trans_t *res) { mg_idxopt_t opt; uldat_t sl; int32_t cutoff; init_aux_table(); ha_opt_update_cov(&asm_opt, asm_opt.hom_cov); @@ -21208,9 +21927,55 @@ kv_u_trans_t *gen_contig_trans(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *qry, sc ul_idx_t *uu = gen_ul_idx_t_sc(ref, sg, ref_sc, gfa->u.n); init_uldat_t(&sl, NULL, NULL, &opt, CHUNK_SIZE, asm_opt.thread_num, uopt, uu); sl.rg = sg; sl.ug = qry; - kv_u_trans_t *res = work_ctg_path_trans(&sl, sg, qry, qry_sc, ref, ref_sc, gfa, ta); uu->ug = NULL; + work_ctg_path_trans(&sl, sg, qry, qry_sc, ref, ref_sc, gfa, ta, qoff, toff, bu, res); uu->ug = NULL; destroy_ul_idx_t(uu); - return res; +} + +void gen_contig_self(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *db, scaf_res_t *db_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint64_t soff, bubble_type *bu, kv_u_trans_t *res) +{ + mg_idxopt_t opt; uldat_t sl; int32_t cutoff; + init_aux_table(); ha_opt_update_cov(&asm_opt, asm_opt.hom_cov); + cutoff = asm_opt.max_n_chain; + init_mg_opt(&opt, !(asm_opt.flag&HA_F_NO_HPC), 19, 10, cutoff, asm_opt.max_n_chain, asm_opt.ul_error_rate, asm_opt.ul_error_rate, asm_opt.ul_error_rate_low, asm_opt.ul_error_rate_hpc, asm_opt.ul_ec_round); + ul_idx_t *uu = gen_ul_idx_t_sc(db, sg, db_sc, gfa->u.n); + init_uldat_t(&sl, NULL, NULL, &opt, CHUNK_SIZE, asm_opt.thread_num, uopt, uu); sl.rg = sg; sl.ug = db; + + work_ctg_path_trans_self(&sl, sg, db, db_sc, gfa, ta, soff, bu, res); uu->ug = NULL; + destroy_ul_idx_t(uu); +} + +void order_contig_trans(kv_u_trans_t *in) +{ + utepdat_t s; uint64_t i, n_th = asm_opt.thread_num; memset(&s, 0, sizeof(s)); + CALLOC(s.hab, n_th); CALLOC(s.buf, n_th); CALLOC(s.ll, n_th); + CALLOC(s.gdp, n_th); CALLOC(s.mzs, n_th); CALLOC(s.sps, n_th); + + for (i = 0; i < n_th; ++i) { + s.hab[i] = ha_ovec_init(0, 0, 1); s.buf[i] = mg_tbuf_init(); + } + + ctdat_t c; memset(&c, 0, sizeof(c)); c.s = &(s); c.ta = in; + + c.mm = 0; kt_for(n_th, worker_for_ctg_trans_order, &c, in->idx.n); + c.mm = 1; kt_for(n_th, worker_for_ctg_trans_order, &c, in->idx.n); + + for (i = 0; i < n_th; ++i) { + s.sum_len += s.hab[i]->num_read_base; s.n += s.hab[i]->num_correct_base; + ha_ovec_destroy(s.hab[i]); mg_tbuf_destroy(s.buf[i]); hc_glchain_destroy(&(s.ll[i])); + hc_gdpchain_destroy(&(s.gdp[i])); kv_destroy(s.mzs[i]); kv_destroy(s.sps[i]); + } + + free(s.hab); free(s.buf); free(s.ll); free(s.gdp); free(s.mzs); free(s.sps); + fprintf(stderr, "[M::%s::] # try:%d, # done:%d\n", __func__, s.sum_len, s.n); + + uint64_t m; + for (i = m = 0; i < in->n; i++) { + if(in->a[i].del) continue; + in->a[m++] = in->a[i]; + } + in->n = m; + kt_u_trans_t_idx(in, in->idx.n); + c.mm = 2; kt_for(n_th, worker_for_ctg_trans_order, &c, in->idx.n); } diff --git a/inter.h b/inter.h index e7e180e..dbf7d51 100644 --- a/inter.h +++ b/inter.h @@ -128,6 +128,8 @@ uint32_t clean_contain_g(const ug_opt_t *uopt, asg_t *sg, uint32_t push_trans); void dedup_contain_g(const ug_opt_t *uopt, asg_t *sg); void trans_base_mmhap_infer(ma_ug_t *ug, asg_t *sg, ug_opt_t *uopt, kv_u_trans_t *res); scaf_res_t *gen_contig_path(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *ctg, ma_ug_t *ref); -kv_u_trans_t *gen_contig_trans(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta); +void gen_contig_trans(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *qry, scaf_res_t *qry_sc, ma_ug_t *ref, scaf_res_t *ref_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint32_t qoff, uint32_t toff, bubble_type *bu, kv_u_trans_t *res); +void gen_contig_self(const ug_opt_t *uopt, asg_t *sg, ma_ug_t *db, scaf_res_t *db_sc, ma_ug_t *gfa, kv_u_trans_t *ta, uint64_t soff, bubble_type *bu, kv_u_trans_t *res); +void order_contig_trans(kv_u_trans_t *in); #endif