From c48a7ad41f6550a47f88ffc03450a44ebe12d33a Mon Sep 17 00:00:00 2001 From: davidebolo1993 Date: Wed, 29 Jan 2025 17:11:08 +0100 Subject: [PATCH] refactoring of dbscan clustering, mod odgi untangle command, output cluster distances (norm and not), fix odgi mask --- .DS_Store | Bin 10244 -> 10244 bytes cosigt_smk/rulegraph.pdf | Bin 0 -> 20019 bytes cosigt_smk/workflow/Snakefile | 8 +- cosigt_smk/workflow/rules/bedtools.smk | 30 +++- cosigt_smk/workflow/rules/cosigt.smk | 20 +-- cosigt_smk/workflow/rules/odgi.smk | 57 ++++---- cosigt_smk/workflow/rules/samtools.smk | 4 +- cosigt_smk/workflow/scripts/annotate.r | 20 ++- cosigt_smk/workflow/scripts/cluster.r | 125 +++++----------- cosigt_smk/workflow/scripts/cluster_dbscan.r | 77 ---------- cosigt_smk/workflow/scripts/cluster_old.r | 145 +++++++++++++++++++ cosigt_smk/workflow/scripts/filter.r | 16 +- cosigt_smk/workflow/scripts/outliers.r | 42 ++++++ 13 files changed, 326 insertions(+), 218 deletions(-) create mode 100644 cosigt_smk/rulegraph.pdf delete mode 100644 cosigt_smk/workflow/scripts/cluster_dbscan.r create mode 100644 cosigt_smk/workflow/scripts/cluster_old.r create mode 100644 cosigt_smk/workflow/scripts/outliers.r diff --git a/.DS_Store b/.DS_Store index f3a4dd28c8cabfa39d8281e0ea9e001267c18d3c..3c968af14502caa1e2d0c3b45ee2829040c8e949 100644 GIT binary patch delta 39 vcmZn(XbISmA;_6hoSc)CpP$1xIY3Z-^J>8uzKIROo7ok9vuqX>Wnu;Z8w(8q delta 76 zcmZn(XbISmAvn2Hkl&Dt!G$58A(!(T)ihzKC zUKC*EWa99dS{XQ*2%8w$8k<1z@j*E{IhYt&L%C*j=!`~@G$pPLs_oWVP6ImeeF*eN zj9~P;p`)==0`)N}5?W1Ul2_wMN165bA2K|KL&nN$b2E>8v-crI;{-&t*V#4RH+Q=h zb>FY}@_!uxp1hvkwqCx@H}o4VJmYury)JIG_2XY|z+b?3xu3vYd_0d;t<#=k%=X}F zC3zd#A6@?Zu!ji#x!2T{_|c`)`IxtA7hHvQ0oTRdM`x`g6HdLmQ-+WK{c+nH=^J0oNnUS}7oncsn?z-uuJ_yKP1yyW;mpsQ`wyOWn<_p&8pWr(X8}t% z-eJuS;A?=9qbIAF9u^uhy(3?v;#ux)9`sJ`$chcyeDX`;)5na`U0F>V)6bRL>tBQm zwH+T@m*WoeyH^n=2`ShT4Vz~RUnZ)zmqt*zwsP_s4O`|8$*4#w?HLh6iV zcYp%5fnB!(Blln+W#)M-(wiz_D34^N>+NV%he?^Lv2}bc4(cBijiEh9>S+A??oc7l zdCg%#F(Y93bMMdmqa{5zc>0Q!%bI?!%eKA>h}U8ZkYTkVjaZQpZ_~1`4HD=6+x`=Vef5XWtFC1trC%S6IF_Tx48HO}$o5wSBnnS`Lp)0Jx3Rlr<1iC*_z=q(&{b5Y3e^^%#3K97gHHkXM+qK}73 z_@CZhuCEVz)SMoq2a_4?dy^ZUZLj+-U7guC!H2p(2<+nxhjqMNL;6G9OzQP`W7?jH z>N}(30%wC4661zVl%&NFuN8%!6HbUwXajVHEtIU5v-jHURuW!olDglu?=ik1MNtBG z-dn+-2^~XD(hTO}Jhc8cZut#ALA_=cwn>BY3h+nPc;p1$YF5xfQot+*uU>UCfJoa6 z8+hB!bXM`K0soSTLf<}BlaUv{$!Fo%SP63lL`|m2fmQ*E#oyiCKrL;Wd9Q0&=__YH zS+g2FU>&*V1(~nhYyrL@V`>Sb`gkH&yd&qeDv_$gQ5x-vJ%}KYdR-whgBRxcf&fqX zV06bMpA$v_nr@sW(AheN`*xlwA~Ic7Pki5rOt9* z7X?Jsc*}^$+h6h}&O|?9h{(Gt%{PU@)iGuR?kOFYCPwtV!w_*+?NHdurU8+7@R_p=D{w;6)fJqfpNEE?+WF(v&K}*je*s=a*of8Tt=X1^gU&zuzrskC-ph&Ilor zJ*UEZC?bl>I3l5F_lLoqv(Jd%UHw;`aNVP8vy!F&NheOyEqIzXBuD9-M#!*@k{^PJ zg!YV~h4dHkT-K5L0mVI+*a^$V@l~`&bLBb`q3gBeKFELL)N<1QUm zf-1`DEYM<;E|;qzXH92dbmh4E$xc1sn>9D4?1 z3Qy9t37tJ^99|wszEHs#fiFa+)F>-CE=lqwZOXC!(lK>_)PuPMsUntaw z24y6JNV&O7b;Sd0amjgZjUEy4IX^ZVnKsMup#)af%|d0QoOKQN=>dv{f2~!rs$n}6 z)_844RIkYHzXtq&FsE!367y^HBQ16-%UXv zPe3NyQyT*~eUXz`08*}v?XYgX{?@4A-et-VTQJM57Hn>+r(UdpW=#>w0C#TtY?oJ@ zP+GwUPD>DJ)#e1S-26;*5J+Op>2|%5LrF3N_~QCZA9UR%^$7$)3Y$5_elazWX?`xo z>E0Uk0xosf;AvFt8pmaiifjnRpAK$OcqJAy!B4luLlOCd-hA@?tB=eZHGW+xwWOFI zI>wKNT?E9!{VXHl$K^kJafHy(NTd5sV8WhA;&l0vWvVRC8GeC|TMz)~j(tp<4Q$sD}`7>vV_Bh(O9P0$^{yf&P{+MV>)`XhVhPt3sB(OQMX!7AuzcwsM{G`~w#DH+62%QwsVuklrOqq{&-k%(aqh7Aq zhk*MrI*wOyA*O29+i3xY1mf$-*cP5-rp-92$42E8WY4<7NthcrY>AemE{31Cy~qxq zXY4ZSYy9((Sur)0VSJ)}yeb2UM90GV9SiGTn%ja3@A$Fl{38dw&Lq345M(E~4^Xf2*`;xXU+L z=uyqWjl;spo^XVqbb1USh?p6XQ_`HKu$Yb|Lnn8WEgzXa7l=@It0b3H#jeyD5EvHU z&0UPRhsFRMIix1#$mojTir=xj;M#$mw=r8~Thq2i;+$h0`&Dsk-C_W&o- zKRrucDv4|F&hw64VppBRQdk~#5i)3p2q=@ZiDBOtZVHwf0;hNMRmxT}@gV@iY?h2t z(`g4`4bw@AjGpxZVHF$&+X)Otox`YM!7%bk%AIrleNGgzQ3f`Kp-PWNNgScP-S}-T zQ@(<3bGE0b(x=qu@_4ac@?sS%lZRyuoG}k^D$@JB|aac2eWxiCfN~3q~ti9M8$Q&bnQBEwi40+;{tDYb@?k^ zue*4K;@@WJo81!_6zheaW)SRZ7_q)b@Z>UIxGm=F`%$=EL8U}kp?imTtiL>={(9}? z6oy5s-t)gm+{VGc3z!vNVRAQf(dQ#*#y4{JYSFko|lwLkdzp3i35UM%Tl>KUO>1!hczqdR2aN@h=j)x8!(Bu{opnC zEUgR{hGm6|4kCb`WxhUX$d?`nFK0A(oGO_mwZy6N39|ClF$jhx3Qh(&P=r4eWMdT6 zd4*p1EYQZrf%J`cL^L#OAoIR5in_T$aCii;veY18vI|zb7CK7yEuN_^V82TCan9g^ zB?l1RQC5&@BgtCE1qFajZ6^2j%bQbUxDb-pL$kY+3c#Anu~i?gWwfgpW*rIvy=2M* z)fpUP#0fM-@a@li_n-vUIB{fYEd;L|rkZmJ<&!=0pQu4+O)x2@;EqRN;Yu?tO{|`v z5l(3hTU1YxQOC*bkf?{Z<1|LucjCr|Jy!Wf}rMt-Codc8r{W25*TBRa`7Cz7C8n`Zj)p@B(}SR*u(c?|RRJOil=g zwPU6KR-$V1R=Ak+>y*Cf+lX9bRUKu38Tm&<1=@^;692F{fB>i)%+1K5m!*p%_Yd}elYYZ8 zU%wb`m1Gco^2r}4ZhcT6|2)e^X7k06o%mdT=h8A#%(v9bYm@H4=QY$a#xm}869djW zmaKf0;qXL(#iB{9iBbkE5wBsDqfN6K z`x36e3`jyZF}tZ(GM4Vc;cp^?Fs+Mex)z$c%8kM;Xpa|~YB%Ij%aS=Yi{GLZE~j4V zQ}pE3Q36r`u&euI4_)!6Ci+D8XVMv*1_`_Z z6LOsLDOL#({|Yxu6JRj%2Im=MTx@%sH-3}sVggo>v(k$Y_dLuO&s2s5e$n4NLV~uV zs#Y8Srty`N;UGcem$?5Rwd#aj6xlCojBojhRt|;jJ4M@m?cauNh5{|VNjjZLC7sdujYu|Tm|#RtJOA&p5<*6 z7>dUQHV%>fj_YgI==N>or;R@Xw}7-F50h~Eph?-_*OSg)0OGi*fEYWlUHB)wVaje! zPjoU`Q67%38$y0JB-s5!ShF7Wd5r0fF=_ z6Qp9NS*00LR1opX3<%O+b7(nkdo=whK=gx}%$_+xV?mg(S;r@*4OhlMEfE6z+WFOJ zhA-L^rTdt*`N?{$akBzMXux1XB<;JP1pub?_o@TTmQ78Wth-vzo-fH={P127E#R={ z#w&-cG3cDFv~q?C$`y!4&|0PS#i39ZxNsJDq!R8|r7rcOmi;~-nUD@Y0kRoETX zTX3J}L|SVIQm&U3_1tvTjG~|oMN$Hs)TltZ@jt1^J6EZ0nt~T~ ze%Fxh5(HTlPIcO0XNbho_nuz)=!n;XX$GQ6vcZ_ypr@x`1wd>^ovny7(77z>K)`9V zog)TB!&cJfUmD8Sfza(iWw{S+}+o#Gc$%G4| z>X=1Mk0VV2qiD&snnT(0@EOIc$8Jf5h5Qb2-DBFSxaNmDfan^8s$?VUp1ud|E{5GGML> z`Zh2H+Qs3c!Ye!EBz=YGOt)oA5jTTk$&qP06v=vi1ypq9ZpXI9Vg*-(b*_n!3EVp8 zD}pD|b(Wwb9`#Vg^W-a2u0paK&zO{@;$}#y*ZH%0D*$Zzwd2r3R)$5bO_#JfZ@xM4 zjD`DFtCD{nRD#1sD=oa=lz61@xaX{C%*^xc7GiTt($`R}nOIJSR_1VzRZD^b4A@(H zzrS#vbx>xAJoI!)7be@E9z2}LUi=EqNkMm(?ebO+n%qwB_$|~*>L8Rii@NN+Y$`AL}C1XX?8sh>$dH_znuC{`zt$+rr?dqz2{IPTMS)+3TCu?I7 z_4JpvjRf)0T0PS>@oq&AreYI`x_a&wfDsJyj~Q9uk8M$488mIwURJleonNg9CV&&a z4~O@uYv^(QmdEP1{{41TGO&n{54t2DI~Yo98@_L07~eY$Z-9E`hHNNn9NHka<#M5> ze08exp5TBe@7!Ry)O3c&aflA9m87h)${{b4BW)8?M1u66PDtjDTjFtQ*9<`#mkjb?#wp$m zX(0Y;MoniMI3jj44u}79XXQ~Nem}P8kOeZc;Ouc;*g2tReCC{>jaR*~ZQQ5|gnErL z+LPC5^GrASN5N0AZ9tqr_SxZEiJv_lGTbaUrQlT#khZk%v(wQQ}{E zz3?8X_4U_IAMG=VqUmG`EDxNd3b7^NAdeF5vJHOaS6-f&YSTHmA*PPcCyjm!*|8a& z&EaU<;em`Nm2f7YbZ)8kyR9e&7!7^Fw^o4p)43LMi{PTtW-cZYpnmo#Upxi$gwHftx{N(0YWHU2rPP&&&yZmTegOR z&o}d>4!qYh>;(t0kJX;=+dlHb3zsYy1Zh+K2RL>G@o|oTozmZd?9dY7SOR>vySG!I zxgfaoCj1Yi!D?p;DU9`^2g1rurKwSP9SWi8n@k` zA~cRE1#3XacU+M``hxl9b)h%iL)KDMIOqiokA~x zd0fTl=r7Gj#rLh4f|=qEm&!s_vK*TTr4Xdf4zU8XcRD5E=&95Kyr#tEchXV@-9+SU zbv-I|mq2z+QvwYj4d+Pg#qI>YH%71X12(6P&u2g2-fD;Y$=G3}lGi!>4SFM#518$? z**q(t1tzKGvx_ho7!!iAtC7I*VC7d;JVYpBkQ7Lqo2+uV(jU&iobo;Lx*sLrhB@0p2V_!NstyD8sBUz1IrPPAyS3S zrCCDkyV)J%K~?BGJJDu6x)<@M;Yo?$S^VNfpHaStvARNo&H{LV3)U5N9MZxVrme^jFDd zK=5y(4Co^0tq8Q{S~02xT4stRmJ_%3u*`TxDpE&krCBO*oLvBQ8c7AG-N86sG6Y}D zk!U>n_!nRdmHt%EOl%imSum8uCgM5*_d?AjroMZWskEW@D4cc?6boktH}U~^I8~Y7 z7$y}D*?d;SQPuN0GNOVIv}$-tD^4!g;7=;@Nwi^A(Ogevnj@!s9$O6+=cK|^F=YX{wTz zgn40!tr9AFQk0+aRI0#T`_cU)r#g*j4!UoIw?c&DQg~J5*?zMIdvspq{72H>h|_f- z^wLSLo%C$v_jk1~QhTJ6e7mR_P7HkL^0rc@+kUquESs8blAw`n>Z&~vj&NPeK6`%~U_i1R}n ztP8wVMX+y5-`Ecjt3m`rsd=4Chfj(n(zSLwiK@|>eXB4mE}QKA-Qo$dw=f|&ZDM~< zV_|1U>mrkEKb4sS(-s51og)Z8Ol)B_mUpX(J$QoI3(`c2^a~on6U{fWLHTDOQPTM5 zQasl;fU%b{sH>x)J5YpQE4C8XdewT;YiJOUezBX~yJ`zokzvuT09?;;|cs{@i5#EsSfx*j|jSXfTjbr=812C0obTKG_T_YsG?U2eE7} zKf~AccA>Byx!pBCE~~Sgl-D#Ue=Z+KwYaVH_-NETA_3c}sD%B`Ip1}kK2^K}f`Nz; zEv_x4VW@d(;&N)8tC9S#w05|k33p=Zi>JV{m}hO86r-!*(3Cu0wl zMG5gW#4rImw+sfNz?05xg9V%VVVmq&&Lv4eN^^<2*^+8c%PPpi_HkG$7FNuiyhD`r zCC`qd$20UgH&}}x1&F#P?PySz)h*g=@g=+MkU2M{Cwll4%A72u=}Q|2w(uxf^d%jr z9{X5KgD1Kd$KV_vZ&U$>)=Cg6c!{|hxYU(BIyuNf!sI@>em z8Fv*FbL!tC!_nXGyZPw8fN(C-_Cxx~hV>EI`GyfQm7=tgUHwMXE_tU*Vyz3D;&m_+ z%Hz07lVxWPk8wyYQ;eucKQUe9Qk?0}D(pSWpej<$D8b4gwh+!s8Y~iGgXshPaAQP} zYx=xK6`P9C5nww4QIVx zx(<-dLbvYv7O*jXt28dUbfibaaR^dMXf8KTno%G8+u8X81r*V3qBU=)SRIT=yX>;d z{GL?NGaSQYh^}{ghcS2^uy4}wHkISYylB#p-E~M&YuCuG%{go9cMwis1SNa2L-T9& zJ9-O^L`lPmz;o2K(dOzbDwE;Na-oHN&GDHyH9ZvuanP&*+aR~S>X$XGslMlDHY6^uMMWK>RQxfVPrx2i>H2$lO0(t zjA2C7#<;aPv#FioC1c%**Qx-@r%UoNk4f$!1tiKX+=SS+lBILtny>-AJ%_tN{HVX> z<>tO@@@vhDLTJhyjqW!gUAfH5c1GHcr7R!bD1z7u?y#4%i&kT80eN9p9mU#>ZS1WC zbi4ArgzsdlX9pU-SkHp8^5BkN^QA0lksI=kI4s z!Dq(bzx1NEHctO#ZA!~P%Rs>PPfWs}3jU}V3E2NpeU|(D{HuMYf45ISuk2=LLO?HX zU}i!5z4AZY7Kp!KO@AYkWUAz)-=Vjb|0hqph=pk%=S0pP^LzudZ=?%KulW2Is%s`rkq|{*2Y%F&&Hq9GpxX z|3~b`yoQH5ids_FLzYKbY1y-i4Es=C4x(lsI;e2J*eQaD!cKNBA~Lcv$$$jWfS9tA z{uq>@G=qXbq{6^s603Ox03}iu!&y~^3_0F*mBW&ZamQGQX{_80MzE#`OWw@I4Bz{- zX+zb%?RdRZ%lVwM>xTOPB>+iKSy9yIai52`>@l92yF5UcBA^=6F^TRzmb@c1Ko|{4 zFfwd4)Lk<9E6Ruk5%CFy)Z-6`2Z{UC<0S^}?dQ>pJK-Dv zEzoIeJ&gI~4^uB*Y-o`5Bda_YNzX4FU(Shyp);B%Id9)|^i( zOf+Hx_K|H@Exuzlb=rHq$XI-H6z0ww&1Dx}6ouxBcL$pa$ch`p&B|gkuO5uT&E=Nx z)Z|#rc+@;kKbYu9eOc}cz8|w<=PP_GVR1&EEIwIzxEW36zgT(DHzex#xm*$sC(}*J zq(s*Vx;G;Ziir*pkENa8gXL6lVPMZm<~z#V8eCWLq}n7*A>BJPgrrE6I8q>4QlLyb zFIisDsOV9WH>XjW+-yOtMdXZ9tJNLh+YvV-{lryZ9L_e6yWCz=X=4QKjJZ+&I{ZjH zzx6z>_kO*PPnYrG&}QFb&eq8`F)NKFy57`h4r)1;BAla7R5rk+wR z8yX%yuGp}|aW0Ul>fq8+R2y8P#D84xQeN6GGHUElZST6o13dg-Nb4CTZM4*g6Zo z?Yz+Pre&=Yya9IPFr9HLW=_j}veqVj6yc9M|X?JcWZXJXm9@}+ukv`Fmg7C zj3jGjyeo(G^TdV3MzHM);;LQ{gNgJGvtY_`WQt@+y*q8?b@e3} zILoah1R1L)7EV!adk0adAue%?cIWhPp@9u+QR)Stb=q@B6?u6t&sLhd0FVAqX`!eQ zo}duyzSaK)f;_w{FAj0e3Am73r#LPo!fC0&?52>{gUxy!nI&RD!#SSpEai2fW-O0- z!_v~jc5ix&0NZ|RxdHm1O6O(daqMvh`^B`2uV~%Pc*2S+0eH{p@RSo=W)*gz|5f9Z zz`$6XV1^C_9EX6*TA4~2c6u;$+v#`RT5&ens}wV0;*6)ItCC|{lmq^cU;axu?(O8d zF2V45cq|4%SuwGdvj^s(HKwn*N=1CjDhh<(G5D)2swy&?tq-a+1W-FR>LJg}+1gjF z=W=9lY-2$Z7|yLMTB8ZZ<>frVTsGb#71UHj&L8b4k5^=JCF}`eU5g8}HK=*Oi_NQN z;(4*O1x}t*%;A-IpfD%3qox@Ni);kHy14hi9X&?^$1k#Tt{~ z4vV^riiUeRgvI&MRr}CnLoWeO`k6xmqaIQ483!OBKS|@u$q_q1bTZLfb6a%0-oM_| z9lRb74$wNC>^4po;~0A^*jQHA4Vq@~@!M@~@x2`)l;(Evo4+=e>@RKM7M{x?^g`Rq z7#PbFAkl!HDwkHD2w3JxrV5fR(}bBGcXE!C6eVntnL5peN29u~KQJiWK)RFb3lenT zo4_0d_9RlXFS4>2ceL!DayGWTDKr`#M9sFXP`mbQwFG+iJw$ywO}Yl&&ZY;&OF%W9 z4i%bbO+*ORaX|={X<80(t6ou-X*fi-6hLy-QHB(O9>e%5&)tdlvQHZ+G;NDNz0vBx!_FK86`e? zApt^*gcxQFq}7aN;JA!rh1rO6Cw45n0y7HZ+^K0mU1we2qUt3_^-xlHX{&4c~pZ4ZgRX|-||RZy{iy}$xtqqIjM zv5L13sJYKqf^HBZg0H(iaVD4*Q8jx@jyOQP(bBo`OUgrgErw|t|;?ufL8H*@4?a$HciNG7R2QuE@<>&mAW)Tyl#?%wBoTX*Sw9aS->FwzI6V42D7W9#Alph^R%$yUk@f27b$=#<{>nJMKri?vlZns zVp^LUMUyct;&bS+RnL8%>sd{Wf#Ytx*AF*iZ@(9Ne?u|<{zid`V;22Z;O4zW!xcu= z9Gzz3+A3m{54v|8?XDF1lLah}Xr~?`JQF7*2J5u7isxKdS_n935rR6@2d@x+z;_#O z4Z5J+>1+R-knobzjw-h3qT>*?PBR=&{IN`=Or+FZ!9dYHPAbYJm?4NgkRAcw6!T&_ zb~<+Q$!O%{S!%j+M*V;mq86gsR#}MNtfP>kwW5@w)x+Gm#2PY`A@nws=hv(LntuEK z&h5wArP^jDJq^$3)CXfi;=@fk(mN&)GtZ*$yAcS~)w7EyUak6;LJS<_mQ-T;Fm5z;zz9o{6KoVOzIaMR2+=i`X7+dv=?I5N_PwJpke0}RZ^MIMzK z{2OIQb~#+y!wC3?+Jtw#>V#dJ4TBlw0}m7Wk&N5z(w$$!vXOk zX{X>^@s`%@p~#~gVH5Hc!TN><)?jsh6AndF3f0tJ&|XDkN91s1c9MndciHW`z9?a6 zN%Xiu$Et{rD8+cOpBiriM?q%E%4gQD_buoTY%CGwdY#({%9YyBXJ&qW z?=lyKgkSnj6LT=kDuD*Ju$fB25`>bCqteiVE#L#C)X8I;PBiRR zW+}txnNQO?oJ9KC+Ol0|FH^{J<&fA2AcNY^{j+CcS3g7?10XtVF_tYj9oj_kVT_@5 z;z%n&lE?iw6BMQ}^U|VRN?Sz=_63woL>dpnh|EfuinIwYAQqelP==RfE~47S1{S%; z@p*-BXLyC_14G^=$!CdyXNcaxhCiG)cAsgf4lGa}iaX{IU-_J=yw$TR(MqI_pgNUx z%R7hE%Cw$jb&0!jZR8(jKwe=x#k_PXRE(521=W>lp?G?YLTBP~k}tKM&G}a3(;)r( zzq!SbDoHi$eQU(9#w_}3^<<1SZuWG^aP7i({lFsCnPjP@uO%vFT{Rg`Om{ibu|c*o0IclS z%+#;Cb;O9uP!hVu6g*W(U$5lxw#VyLy{o;Bz8~;8vsS3w_Fd#|2zt zy-$nOf-4T32|eL765#fQ3^DXLKwJX`p)2l-=vQ(Fa#zHheC&#y6JEl{I0R!MS8Lf9 z_-TvWn}QlEK5B!4@{Giq9}1poH8?<78?9 zaJ!dQjH*x+v(j?{e3L3+;)p+xvUcS{+!TFl1853K<$IGuP-n1egH#1ct5Eh7SxX98 zt+1`Ow1gw;^-klD6PZVx3xrzm88J=Cz*!80%_Q}CmPeqMud~m;TMvB{W@V z5Qs@9PVv)tmRk6hq>R**Bp5J}A01u--v!gzmQIAZPWP`sf zuY|EnFiGH&gkA#qJ{vWwPWO|9Oe%g)+a!z>zOIWiKW4(@#(?f8{l3)Ao*7MDJ$X6w zsnifsYet2ii6`oEld>jZ^2lhyUXr3zXsfc8Nrk?pNMc9O+?0$JvDHCU80`$2TN#pg zN#l9_1*yl8O(QnkmC&}2(>K2S)5cB%?{HjSrPq82&Gj1g8ka4k5AYhEfKYRb_i4~b zGHK}d_wcf=4oo-|A=7 zt^oc7N)q#9I37+EgI@A(7}tq|4Zxb+Vc9j}w5vC;0U#rkB zS(}+d^&x6njAsS+;|LOUL;dmuh*yn%D*SH3Z-dsEY#TE(@Z-eRWaD8sK>n`leTR9) z%sSEC8}qsW1!Xf>UN-?h1X0#P1x~!)Z*HkTG%;{kxDld%e~L`sVopR*ry=%D)*6MX zEM1$BcU|04D0N0U_3+Avd{ODdH`P>z)=h?0AI~STA$7S1|M-AMSSbP?>-YrzXWXim z2g=4qpcuL__fFBXex3HmLGlkax!?bq4Dg>!K zY9bR^SCZ)MX(Cfb4Jz7DO_UA}Q&!4oZQKSfcjqy$T7FG2kQbQ~I3oXxq6f0F)t!O* zz$wY^jn=(H&xX(C!S{*$?_0H~YK0%Ztz!I0fK@Kn%T34Iy&Sp4T^_Z}yUl`^nfJ>Z zu?G~7$f_Yd?{vs{;Wb8Q+n`cgdvk*sUoCO(eqw4?dG(W*auG35`_$f1OZf3atOI6iz zD_+nd^M&@u3U~?kB5liyKqyXDdlmhDcCfs6D+X^KCw!-R{Y zzof3*yB;=I<3>}a}iW0YZDZ0@eSEz|W@}V-j z`2ZRp7$*El$sq%Wgxg>X#Xvcz3NSdaebiHcf1!brV75s~J7Omsh& zdtBRSeQbMw8#s`Cm~V>4o4=cz{+efJHR8m>>cJ7QlP4BfnMz-Q?)5rKS^O}bdZ2Zn zoy_2r>3BZ%PSWQ?<9)JvdjW;-0`rm7M@b0dU0wR7`f&9Uy+XEhL%UdQshgikAohdT z+7+?}d&7H`YdC9z)yw6ENHE+LQDdm9dm))dKXtd ztG9i?HV(;Ax-(=57rm9O+;SZfhu>6QJ_1T2Kjq;4b0#BobL>Fa zy#kZwW;w*Xk|nS1$3_e-T0-DxU}!DQ5UJjFB>V?#3^e&bpI zhn@5wCtU_I8^Q`{Jv-Z^(*B2_P}zeZ-jUQkp|>HDP-EKiHXd4(7@o0-Z`2A{ed`vz z*CQcpXI0R}>172_6U-wz|Lbo!@6sSN{#W}oknLzNPbFAAH6cf8&v~d1tL|i>EqD#P zS)Yxf9(4A?5c<+Buyt1>D9?9|u*Y^(J}KV~%*zs>#o5iDtCuSl&8FStbdQ57c6TJN z13uAL_t9!SVrl~+(W*U(jF($gdHum)uUuGs44Z0T?Y7+>VA_>IQs_qgq#$$yz5Mw- z^spRFqUZ;Po9J6p#@%gTud6S^?xFE9Fb|XHbAhK9?g26!!UFHk~k8BITVC_FV z3teAG+I0q^yY|or&A_-eU46Rtf{no1HDJYHISjvb0NG;LH6y&Q!#M}P=2jxW;W~v1 zd2t;2fl%D~sxH}f^MTxBRS$k`gLVCptka$a)^>VXCs^FCWl?5O1jMad7lsSgZW?%2 zB^q`XczMiLrv^DZQv0V4-OtBo~%CrOuC<4ZrqOtUOGpXu>f59 z`>%X^1H(9=7OdAHpHO(teqj(ihh9*Sft|!Fu!vFnFnfcTorX{x@`#2W43OF~11Hu9 zrcHqbJ)R{f4e~ADD+C7pe4vMDyU>?42E%5d2)mw312#sUsVyG^HVlhzg+M}JZT&H- z7(7FiU~LBSL1bW@ghKwlH8D2+^)D>3rrkfV@(az0=D{ejU=D2&K{BQ`fZt}aZSlJf zwIF`P>EuS^$LUOQ57WEzf06Jy+*=f;2Y+N0XCdhE$$|?RbRKByR)eswKnRx#p#`mi zHI(;f1QTu&R7MjvB%krY3q1RRjD!wd_ZM~;CwdBow9`VKU^`RW*oQpordm^|)Tvvva!2 z*JPtj=mB4MWEW`Q#J0(#=Pz)0#l)qMpf6Upu6krmb|!*0zSdaI(O{Dzq9b&V$ezq< zb$MxgfQSF}i?MBwA;#KQyv_;SRJ??RGYd`nK%in|5D!3@qP?_Z8}Bt0VB63jg9gKFJ#%FCzG{X?5z# zGG%`IH{tRWQ~+E#r*P$xR;-$8c}Yktro<#?n1k{}MtqPX$=;A(3jrS!ekg4nsqOH3 zTnZ}{0LBuA(b9Ovjz8>KAx;D7rmOgtK20M&S%b{9gXArX`GGS2MY&`$i}eTB<;w8P z^ek@6%jT``WPPPtzaM)iQP}aculi;aqh|7xzPz#$_7^pfceS)=r=ZhLx1FYN#DE`hO z_>)KZccztr*(X6``nRRv-;9rzk%5VUfR=@giGY!vlbL{#iJ9qNtWUzpzzSd_U}I)w zLcs7>IRQtbKd|r*Is4<#AOHTyv`oybe<_3v?8Hp~X6Bzf@w1x0R7y@J)~W;?f0zFw zi2oz}>EY*|X~IPChvWS>*ZadZ|Jj@Wrip)S|E|v2(CIITsr*R}|K$Cp6*O=(`HPbN zzl{vQ!O=;`+`!?V0gyKMm(0lYpAq@baDTf0mrH*-DPr^alsCY}jDTJZU?X7T2>3_w zSIcSuV<&URf3ea(N8$h1n~nKT&;I{;voidtm+jMWHf9zARu;~`QZ`0b0%q2~b!^Pc z1WfGge|=fmIsZyI*;zjAm_N%g{%!wRmgRGs^>d$K=KNERnShn^6P7Zv|CRpPX8i{S ze?nbb6&t{xhu{hRT7mw14jKQ$z5ijZ1dRWI#{b)r^xt%riJh74|D?0ZmQKp^sOv7B z9;RfeV&^dN-ynSZI7X;J$pRpRx%)1mqJcxx>SeI?F^B?@Ag;}^Bnp5+4YM~dQIzs2 z1nh-LASJ9(rWwf4|RSPG%tr z%dr5)tV^bo$N`TRLrQfbiuB)eM5n>(bi#|JOSvD<>>_H;xxJ&U-5|WLt>t6}S%`Z! z>&HX05l*nN>)b)$}%AA~hvbQhd_W~_T zsmD{~S-+?ByA-l^WJp*5?nt0PYfhuMX`@n){Kk2_R5h$SB9ELqIM%-Q#`s_guTnXg zeLfLu?nv}&oK40~O#f5AS@<{ziD=N_FuMmu77`$6BatP_T@!hmr*%FoK>cQSeIPO; zuQTtOfkTYwc}?|{)F`rUu?&))lYLOWY=+5!?^<|gb(mT(K#+QzaD8k zk?i-ZHb5g)(!uG5UGr6Uvsb^Z&##qw6-D6;K`meeo^To9EyP1DRf(IQESljL%rhr(po+l-%BC ztwgl>7%Z7gtd9m9rOwc_7KK`~OZ(W`jnkHe5-L*rH5pXe7m^=ntP1I4i^aa*%{@TB z?un_BNOv54nV%Bs#hG5NZzy!yx}PHDBFwTZhK1zH;92MQ zm&fy|@E1=`rQb&1eIHq%zWv^zTs6g1Behm_KZF%(M8@7B@>VRtEraTWfgTg8$}8tG z1E|mQosdX6e6!5sT9!%7?a!mlPb`$2g)vxHKC7467p!Tr~^V!xB#%ar@d$J`5xPR|5Bh#UehEpnc}|;Rl>_f zoi{BrdaGFBOLth1q=EFJUQaaaHpL=D1*KS4Zel8AfmJcDUqKP68DFv%S7kQj7OGzo zR~m{&6uv)P8FxPmBqtm#7|$e4kv@9uJR;H3Q{snE||k=0OVOj za^}tCeV_t#t`-PEG+n&u=?)x^%?b`Vbno`|yuI{zosDYiIQD<|D&#Gk5y*`K%aS=< z&nv|qO>AquE`m4+`BNn^<({A=lb?A0Kl>7U{VlJ^B?T?z-FrfLZb|j7J3d?JNp!wZ z=#h5@I2P`|Rqyt7cxja%0gfiQaav zkYsMD)nA?0S6p2vR4>l=^K1AXixU-=EzCXBB@&Ny_xxqt^Wezm-+NyDWcX&f ztGKD&pO%fU1gA$GtvJ#uZ@fxUTk_w!_SLJd9PU}A^+%)O)S-&dmq+;}wIxo^jYvCw zDZ=JRgpi{vOSf`_h@)k);$!xnpDTaZCC}3iKdXJ9xo4Bs8Bu|hGl?t`n-~!223&hhZ3~cwe&pJ0F z?XbCI@N$VPt>mu8lRchEGmo5@pRwYP+>9A-0)%;e3KDFRV>ptXPbA;T_6q;*HsN-Y z;p7MDHO+_7uyP?^$_)b*TQ^?A33AN`?RcbKgf=rtSR7c(R%Ljg z_g{Hc=zqf@mA@&LjDbSbNAy7=F242={s+?XZctc1d%sgs0$_ zbFPaTT-??(RCz70G7@?c;H30OQqGOPEn4Od&-5#=PPGQM?qHb1({Af%DN*)uoBDxc z9i{?@H+0ROn)va6OoN_VY@UNn3+w6IR|1xmWUzHCG|gjPg;CG=z4LR!u}aCM!5wQ+Y67FGctT?)|tdFEK_&x z@IR9^nzjYUCOlNx#iwH>SAO;7vUHZnv&Zilz5iU6a(n)xYHyBJkJ9>679Ca1Y+iD> zWsz-R=>@BGehC}?S8Uzwmi}(ms$H6|CPuqvxv#34zshK7%}>#bYh326d$;tMR%6Y9 z>#u@*j)tDsXWv?U8B0jQn!&)3G%_-W2P9~jA7<+q(Rc;7OF^w+NM9kuInvcRl#ITD z3EZGT)KvgCD^c2+$cq4RbrzgGT|fZ>l7=+*5gi8X!2{F~Tv-h4t$608<%7Zjv_3N^ zH9Zs9X9o7J9Dyr4Q^EZzP@C1iC?&NB)I-qlOaXR>GD|AqQo*GK1v#m?piUCF2LxTx z2yzS}!hjtwV3YUg)*1}&8Ueit>2DG8Caf!? zpbvAot_RpGNY5@M5!gcmIu+EvF$8zgAd5hen*ETYu@I|GL4yvci4(Ni6w*KS&P)NB z1nyeFmy?zPgT@fagNCpn4p4{(6y>LsCIb)L(nwCsEXr3f)H5{I19ovVGD=Dctn`6v zWr3pUMTrF&z~h_r@{7_nA)Q@k;Bj0fnfZAxpmnwyE>=bcMy3XaMuvul1{TI<+6IQ| z1_qi)mI8Y_=+1Ud1onRO)4`zu_5yT?EW)c5MX70A1`0-oTn2EUU}kD+Y^so^01-2^ z1a-K;vI==nF#`(|P+159NMc4780riRu&FZxmaiy!4Nc8~eRni5OG8kaLQ!X60qPi{ ziW!=MI{z4ACcwTMie5ug6Ek4^ql*DMgJ@!AMxZ_(syYiJQ!HZU80J}+0+$-2>oqe# z_lKc{1*Tq0V-w6UFfj$DC74@EiV}fi4MpH%-GVc#QbF+s%Dh4O`6UYA_=jXbP=*Iq cT;L1|9wSREDgno|p{1#z5tpi}tG^o;0I#v!z5oCK literal 0 HcmV?d00001 diff --git a/cosigt_smk/workflow/Snakefile b/cosigt_smk/workflow/Snakefile index 8471793..07081b2 100644 --- a/cosigt_smk/workflow/Snakefile +++ b/cosigt_smk/workflow/Snakefile @@ -27,15 +27,15 @@ cosigt_input=list() for region in config['region']: for sample in df['sample_id'].tolist(): cosigt_input.append(config['output'] + '/cosigt/'+ sample + '/' + region + '/cosigt_genotype.tsv') - cosigt_input.append(config['output'] + '/cosigt_dbscan/'+ sample + '/' + region + '/cosigt_genotype.tsv') - cosigt_input.append(config['output'] + '/bwa-mem2/'+ sample + '/' + region + '.realigned.bam.all.pdf') + #cosigt_input.append(config['output'] + '/cosigt_old/'+ sample + '/' + region + '/cosigt_genotype.tsv') + #cosigt_input.append(config['output'] + '/bwa-mem2/'+ sample + '/' + region + '.realigned.bam.all.pdf') #skip coverage calculation and plotting cosigt_input.append(config['output'] + '/odgi/viz/' + region + '.png') cosigt_input.append(config['output'] + '/odgi/dissimilarity/' + region + '.tsv') cosigt_input.append(config['output'] + '/odgi/view/' + region + '.node.length.tsv') cosigt_input.append(config['output'] + '/pgrtk/bundles/' + region + '.bstruct.dist.tsv') if config['annotations'] != '': cosigt_input.append(config['output'] + '/odgi/untangle/' + region + '.gggenes.pdf') - cosigt_input.append(config['output'] + '/odgi/untangle_dbscan/' + region + '.gggenes.pdf') + #cosigt_input.append(config['output'] + '/odgi/untangle_old/' + region + '.gggenes.pdf') rule cosigt: input: @@ -46,4 +46,4 @@ rule benchmark: input: config['output'] + '/benchmark/resources.time.pdf', config['output'] + '/benchmark/resources.mem.pdf', - config['output'] + '/benchmark/tpr.pdf' \ No newline at end of file + config['output'] + '/benchmark/tpr.pdf' diff --git a/cosigt_smk/workflow/rules/bedtools.smk b/cosigt_smk/workflow/rules/bedtools.smk index 1982c51..b2f3926 100644 --- a/cosigt_smk/workflow/rules/bedtools.smk +++ b/cosigt_smk/workflow/rules/bedtools.smk @@ -24,7 +24,33 @@ rule bedtools_merge: bedtools sort \ -i {input} | \ bedtools merge \ - -d 100000 \ + -d 200000 \ -i - > {output} \ && cat {params.ref_region} >> {output} - ''' \ No newline at end of file + ''' + +rule filter_outliers: + ''' + https://github.com/davidebolo1993/cosigt + ''' + input: + rules.bedtools_merge.output + output: + config['output'] + '/bedtools/filtered/{region}.bed' + threads: + 1 + resources: + mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], + time=lambda wildcards, attempt: attempt * config['default']['time'] + container: + 'docker://davidebolo1993/renv:4.3.3' + conda: + '../envs/r.yaml' + benchmark: + 'benchmarks/{region}.filter_outliers.benchmark.txt' + shell: + ''' + Rscript workflow/scripts/outliers.r \ + {input} \ + {output} + ''' \ No newline at end of file diff --git a/cosigt_smk/workflow/rules/cosigt.smk b/cosigt_smk/workflow/rules/cosigt.smk index ab16d5e..2799a61 100644 --- a/cosigt_smk/workflow/rules/cosigt.smk +++ b/cosigt_smk/workflow/rules/cosigt.smk @@ -1,4 +1,4 @@ -rule cosigt_genotype: +rule cosigt_genotype_old: ''' https://github.com/davidebolo1993/cosigt ''' @@ -7,8 +7,8 @@ rule cosigt_genotype: sample_cov_map=rules.gafpack_coverage.output, json=rules.make_clusters.output output: - config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv', - config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv' + config['output'] + '/cosigt_old/{sample}/{region}/cosigt_genotype.tsv', + config['output'] + '/cosigt_old/{sample}/{region}/sorted_combos.tsv' threads: 1 resources: @@ -19,10 +19,10 @@ rule cosigt_genotype: conda: '../envs/cosigt.yaml' params: - prefix=config['output'] + '/cosigt/{sample}/{region}', + prefix=config['output'] + '/cosigt_old/{sample}/{region}', sample_id='{sample}' benchmark: - 'benchmarks/{sample}.{region}.cosigt_genotype.benchmark.txt' + 'benchmarks/{sample}.{region}.cosigt_genotype_old.benchmark.txt' shell: ''' cosigt \ @@ -33,17 +33,17 @@ rule cosigt_genotype: -i {params.sample_id} ''' -rule cosigt_genotype_dbscan: +rule cosigt_genotype: ''' https://github.com/davidebolo1993/cosigt ''' input: graph_cov_map=rules.odgi_paths_matrix.output, sample_cov_map=rules.gafpack_coverage.output, - json=rules.make_dbscan_clusters.output + json=rules.make_clusters.output output: - config['output'] + '/cosigt_dbscan/{sample}/{region}/cosigt_genotype.tsv', - config['output'] + '/cosigt_dbscan/{sample}/{region}/sorted_combos.tsv' + config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv', + config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv' threads: 1 resources: @@ -57,7 +57,7 @@ rule cosigt_genotype_dbscan: prefix=config['output'] + '/cosigt/{sample}/{region}', sample_id='{sample}' benchmark: - 'benchmarks/{sample}.{region}.cosigt_genotype_dbscan.benchmark.txt' + 'benchmarks/{sample}.{region}.cosigt_genotype.benchmark.txt' shell: ''' cosigt \ diff --git a/cosigt_smk/workflow/rules/odgi.smk b/cosigt_smk/workflow/rules/odgi.smk index 28253d4..55e0f53 100644 --- a/cosigt_smk/workflow/rules/odgi.smk +++ b/cosigt_smk/workflow/rules/odgi.smk @@ -13,7 +13,7 @@ rule odgi_chop: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -40,7 +40,7 @@ rule odgi_view: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -66,7 +66,7 @@ rule odgi_paths_matrix: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -144,7 +144,7 @@ rule odgi_similarity: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -171,7 +171,7 @@ rule odgi_dissimilarity: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -184,14 +184,14 @@ rule odgi_dissimilarity: --distances > {output} ''' -rule make_clusters: +rule make_clusters_old: ''' https://github.com/davidebolo1993/cosigt ''' input: rules.odgi_similarity.output output: - config['output'] + '/cluster/{region}.clusters.json' + config['output'] + '/cluster_old/{region}.clusters.json' threads: 1 resources: @@ -202,7 +202,7 @@ rule make_clusters: conda: '../envs/r.yaml' benchmark: - 'benchmarks/{region}.make_clusters.benchmark.txt' + 'benchmarks/{region}.make_clusters_old.benchmark.txt' shell: ''' Rscript workflow/scripts/cluster.r \ @@ -210,14 +210,15 @@ rule make_clusters: {output} ''' -rule make_dbscan_clusters: +rule make_clusters: ''' https://github.com/davidebolo1993/cosigt ''' input: - rules.odgi_dissimilarity.output + matrix=rules.odgi_dissimilarity.output, + filter=rules.filter_nodes.output output: - config['output'] + '/cluster_dbscan/{region}.clusters.json' + config['output'] + '/cluster/{region}.clusters.json' threads: 1 resources: @@ -229,10 +230,12 @@ rule make_dbscan_clusters: '../envs/r.yaml' benchmark: 'benchmarks/{region}.make_clusters.benchmark.txt' + params: + threshold_file=config['output'] + '/odgi/paths/matrix/{region}.shared.tsv' shell: ''' - Rscript workflow/scripts/cluster_dbscan.r \ - {input} \ + Rscript workflow/scripts/cluster.r \ + {input.matrix} \ {output} \ 0.95 ''' @@ -253,7 +256,7 @@ rule odgi_viz: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -341,7 +344,7 @@ rule odgi_procbed: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -408,7 +411,7 @@ rule odgi_inject: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -436,7 +439,7 @@ rule odgi_flip: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -466,7 +469,7 @@ rule odgi_untangle: mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'], time=lambda wildcards, attempt: attempt * config['default']['time'] container: - 'docker://pangenome/odgi:1736526388' + 'docker://pangenome/odgi:1738101065' conda: '../envs/odgi.yaml' benchmark: @@ -477,18 +480,20 @@ rule odgi_untangle: -R {input.txt} \ -i {input.og} \ -j 0.3 \ - -g > {output} + -g \ + -m 1000 \ + -e 1000 > {output} ''' -rule plot_gggenes: +rule plot_gggenes_old: ''' https://github.com/pangenome/odgi ''' input: tsv=rules.odgi_untangle.output, - json=rules.make_clusters.output + json=rules.make_clusters_old.output output: - config['output'] + '/odgi/untangle/{region}.gggenes.pdf' + config['output'] + '/odgi/untangle_old/{region}.gggenes.pdf' threads: 1 resources: @@ -499,7 +504,7 @@ rule plot_gggenes: conda: '../envs/r.yaml' benchmark: - 'benchmarks/{region}.plot_gggenes.benchmark.txt' + 'benchmarks/{region}.plot_gggenes_old.benchmark.txt' shell: ''' Rscript \ @@ -509,15 +514,15 @@ rule plot_gggenes: {output} ''' -rule plot_gggenes_dbscan: +rule plot_gggenes: ''' https://github.com/pangenome/odgi ''' input: tsv=rules.odgi_untangle.output, - json=rules.make_dbscan_clusters.output + json=rules.make_clusters.output output: - config['output'] + '/odgi/untangle_dbscan/{region}.gggenes.pdf' + config['output'] + '/odgi/untangle/{region}.gggenes.pdf' threads: 1 resources: diff --git a/cosigt_smk/workflow/rules/samtools.smk b/cosigt_smk/workflow/rules/samtools.smk index 7ba62b9..cb30f14 100644 --- a/cosigt_smk/workflow/rules/samtools.smk +++ b/cosigt_smk/workflow/rules/samtools.smk @@ -75,7 +75,7 @@ rule samtools_faidx_extract: ''' input: fasta=rules.add_target_to_queries.output, - bed=rules.bedtools_merge.output + bed=rules.filter_outliers.output output: config['output'] + '/samtools/faidx/{region}.fasta' threads: @@ -92,7 +92,7 @@ rule samtools_faidx_extract: shell: ''' samtools faidx \ - -r <(awk '{{print $1":"$2+1"-"$3}}' {input.bed}) \ + -r <(awk '{{print $1":"$2"-"$3}}' {input.bed}) \ {input.fasta} > {output} ''' diff --git a/cosigt_smk/workflow/scripts/annotate.r b/cosigt_smk/workflow/scripts/annotate.r index e68a719..e95795a 100644 --- a/cosigt_smk/workflow/scripts/annotate.r +++ b/cosigt_smk/workflow/scripts/annotate.r @@ -12,14 +12,28 @@ gtf<-args[1] path<-args[2] bed<-args[3] +#split region +reg<-unlist(strsplit(unlist(strsplit(basename(gtf), ".", fixed=T))[1], "_")) +start<-as.numeric(reg[3]) +end<-as.numeric(reg[4]) + #process df<-import(gtf) genes<-unique(df$gene_name) +count<-0 for (g in genes) { subdf<-as.data.frame(df[df$gene_name == g]) - subdf<-head(subdf[order(subdf$end-subdf$start,decreasing=TRUE),],1)[c("seqnames", "start", "end", "gene_name")] + subdf<-head(subdf[order(subdf$end-subdf$start,decreasing=TRUE),],1)[c("seqnames", "start", "end", "gene_name", "gene_type")] if (nrow(subdf)>=1) { - subdf$seqnames<-path - fwrite(subdf, file = bed, sep = "\t", row.names = FALSE, col.names = FALSE, append = T) + if (subdf$start >= start && subdf$end <= end) { + count<-count+1 + subdf$seqnames<-path + fwrite(subdf, file = bed, sep = "\t", row.names = FALSE, col.names = FALSE, append = T) + } } +} + +if (count == 0) { + subdf<-data.frame(seqnames=path, start=start, end=end, gene_name="unknown", gene_type="unknown") + fwrite(subdf, file = bed, sep = "\t", row.names = FALSE, col.names = FALSE) } \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/cluster.r b/cosigt_smk/workflow/scripts/cluster.r index 4b929d5..585844d 100644 --- a/cosigt_smk/workflow/scripts/cluster.r +++ b/cosigt_smk/workflow/scripts/cluster.r @@ -3,10 +3,8 @@ # Load required libraries library(data.table) library(reshape2) -library(NbClust) library(rjson) -library(dendextend) -library(randomcoloR) +library(dbscan) # Set data.table threads setDTthreads(1) @@ -15,40 +13,25 @@ setDTthreads(1) args <- commandArgs(trailingOnly = TRUE) input_file <- args[1] output_file <- args[2] +similarity_threshold<-as.numeric(args[3]) # Read and process input data df <- fread(input_file) -df$jaccard.distance <- 1 - df$jaccard.similarity - -#find outliers -z.scores.a <- (df$group.a.length - mean(df$group.a.length)) / sd(df$group.a.length) -z.scores.b <- (df$group.b.length - mean(df$group.b.length)) / sd(df$group.b.length) -outliers.a<-df[abs(z.scores.a) > 3] -outliers.b<-df[abs(z.scores.b) > 3] -outliers<-unique(c(which(abs(z.scores.a) > 3),which(abs(z.scores.b) > 3))) -if (length(outliers)==0) { - excluded<-c() -} else { - newdf<-df[-outliers] - excluded<-unique(df$group.a[df$group.a%in%newdf$group.a == FALSE]) - df<-newdf -} # Create distance matrix -regularMatrix <- acast(df, group.a ~ group.b, value.var = "jaccard.distance") -#max distance if NA -regularMatrix[is.na(regularMatrix)]<-1 -distanceMatrix <- as.dist(regularMatrix) +regularMatrix <- acast(df, group.a ~ group.b, value.var = "estimated.difference.rate") +maxD<-max(regularMatrix[!is.na(regularMatrix)]) +regularMatrix[is.na(regularMatrix)]<-Inf +normRegularMatrix<-regularMatrix/maxD +distanceMatrix <- as.dist(normRegularMatrix) -# Calculate silhouette score and best partition -max_cluster <- round(length(unique(df$group.a)) / 5) ##control -res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette", - distance = NULL, max.nc = max_cluster)$Best.partition +eps<-1-similarity_threshold +res <- dbscan(distanceMatrix, eps=eps, minPts = 1)$cluster +names(res)<-labels(distanceMatrix) # Format results res.list <- lapply(split(res, names(res)), unname) -named_res <- lapply(res.list, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup") -for (e in excluded) {named_res[[e]] <- "*"} +named_res <- lapply(res, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup") jout <- toJSON(named_res) # Write JSON output @@ -57,18 +40,18 @@ write(jout, output_file) # Create reversed data reversed_data <- list() for (key in names(named_res)) { - value <- named_res[[key]] - if (!is.null(reversed_data[[value]])) { - reversed_data[[value]] <- c(reversed_data[[value]], key) - } else { - reversed_data[[value]] <- key - } + value <- named_res[[key]] + if (!is.null(reversed_data[[value]])) { + reversed_data[[value]] <- c(reversed_data[[value]], key) + } else { + reversed_data[[value]] <- key + } } # Create haplotype table haplotable <- data.frame( - haplotype.name = unlist(reversed_data), - haplotype.group = rep(names(reversed_data), lengths(reversed_data)) + haplotype.name = unlist(reversed_data), + haplotype.group = rep(names(reversed_data), lengths(reversed_data)) ) rownames(haplotable) <- NULL @@ -76,70 +59,32 @@ rownames(haplotable) <- NULL tsv_output <- gsub(".json", ".tsv", output_file) fwrite(haplotable, tsv_output, row.names = FALSE, col.names = TRUE, sep = "\t") -# Simplify matrix names for hclust -simplify_names <- function(x) unlist(strsplit(x, ":"))[1] -colnames(regularMatrix) <- sapply(colnames(regularMatrix), simplify_names) -rownames(regularMatrix) <- sapply(rownames(regularMatrix), simplify_names) - -# Perform hierarchical clustering k <- as.numeric(tail(sort(res), 1)) -hc <- hclust(as.dist(regularMatrix), method = "average") - -#calculate average distances between clusters -clusters <- cutree(hc, k = k) - -cluster_dist <- matrix(0, nrow = k, ncol = k) -rownames(cluster_dist) <- paste0("HaploGroup", 1:k) -colnames(cluster_dist) <- paste0("HaploGroup", 1:k) +#write distances? +cluster_dist_norm <- matrix(0, nrow = k, ncol = k) +rownames(cluster_dist_norm) <- paste0("HaploGroup", 1:k) +colnames(cluster_dist_norm) <- paste0("HaploGroup", 1:k) +cluster_dist<-cluster_dist_norm # Calculate mean distances between clusters for(i in 1:(k-1)) { for(j in (i+1):k) { # Get indices for each cluster - cluster_i <- which(clusters == i) - cluster_j <- which(clusters == j) + cluster_i <- which(res == i) + cluster_j <- which(res == j) # Calculate mean distance between clusters - distances <- regularMatrix[cluster_i, cluster_j, drop = FALSE] + distances <- normRegularMatrix[cluster_i, cluster_j, drop = FALSE] mean_dist <- mean(distances) - # Store distances symmetrically + cluster_dist_norm[i,j] <- mean_dist + cluster_dist_norm[j,i] <- mean_dist + distances <- regularMatrix[cluster_i, cluster_j, drop = FALSE] + mean_dist <- mean(distances) cluster_dist[i,j] <- mean_dist cluster_dist[j,i] <- mean_dist } } -distance_output <- gsub(".json", ".hapdist.tsv", output_file) -fwrite(data.frame(h.group=row.names(cluster_dist),cluster_dist), distance_output, row.names = FALSE, col.names = TRUE, sep = "\t") - -#plot -palette <- distinctColorPalette(k) - -dend <- as.dendrogram(hc) %>% - set("branches_k_color", value=palette, k = k) %>% - set("labels_col", value=palette,k = k) %>% - set("labels_cex", 0.6) -#map leaf colors to haplogroups -leaf_colors <- get_leaves_branches_col(dend) -leaf_names <- get_leaves_attr(dend, "label") -cluster_map <- data.frame( - leaf = as.character(leaf_names), - cluster = clusters[leaf_names], - color = leaf_colors -) -cluster_colors <- sapply(unique(clusters), function(cluster) { - leaf_index <- which(clusters == cluster)[1] - leaf_colors[leaf_index] -}) -cluster_info <- data.frame( - cluster = paste0("HaploGroup", unique(clusters)), - color = cluster_colors -) - -unique_clusters <- cluster_map[!duplicated(cluster_map$cluster), ] -#actual plot -pdf(gsub(".json", ".pdf", output_file), width = 20, height = 15) -dend %>% plot(horiz = TRUE) -legend("topleft", - legend = paste0("HaploGroup", unique_clusters$cluster), - fill = unique_clusters$color, - cex = 0.6) -dev.off() \ No newline at end of file +distance_output <- gsub(".json", ".hapdist.norm.tsv", output_file) +fwrite(data.frame(h.group=row.names(cluster_dist_norm),cluster_dist_norm), distance_output, row.names = FALSE, col.names = TRUE, sep = "\t") +distance_output <- gsub(".json", ".hapdist.tsv", output_file) +fwrite(data.frame(h.group=row.names(cluster_dist),cluster_dist), distance_output, row.names = FALSE, col.names = TRUE, sep = "\t") \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/cluster_dbscan.r b/cosigt_smk/workflow/scripts/cluster_dbscan.r deleted file mode 100644 index c2d112d..0000000 --- a/cosigt_smk/workflow/scripts/cluster_dbscan.r +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env Rscript - -# Load required libraries -library(data.table) -library(reshape2) -library(rjson) -library(dbscan) - -# Set data.table threads -setDTthreads(1) - -# Parse command-line arguments -args <- commandArgs(trailingOnly = TRUE) -input_file <- args[1] -output_file <- args[2] -similarity_threshold<-as.numeric(args[3]) - -# Read and process input data -df <- fread(input_file) - -#find outliers -z.scores.a <- (df$group.a.length - mean(df$group.a.length)) / sd(df$group.a.length) -z.scores.b <- (df$group.b.length - mean(df$group.b.length)) / sd(df$group.b.length) -outliers.a<-df[abs(z.scores.a) > 3] -outliers.b<-df[abs(z.scores.b) > 3] -outliers<-unique(c(which(abs(z.scores.a) > 3),which(abs(z.scores.b) > 3))) -if (length(outliers)==0) { - excluded<-c() -} else { - newdf<-df[-outliers] - excluded<-unique(df$group.a[df$group.a%in%newdf$group.a == FALSE]) - df<-newdf -} - -# Create distance matrix -regularMatrix <- acast(df, group.a ~ group.b, value.var = "estimated.difference.rate") -#max distance if NA -regularMatrix[is.na(regularMatrix)]<-1.0 -distanceMatrix <- as.dist(regularMatrix) - -#calculate dbscan -maxD<-max(distanceMatrix) -normD<-distanceMatrix/maxD -#no need to normalize - max_distance here is 1 and values we have are in that range -res <- dbscan(normD, eps = 1-similarity_threshold, minPts = 1)$cluster -names(res)<-labels(distanceMatrix) - -# Format results -res.list <- lapply(split(res, names(res)), unname) -named_res <- lapply(res, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup") -for (e in excluded) {named_res[[e]] <- "*"} -jout <- toJSON(named_res) - -# Write JSON output -write(jout, output_file) - -# Create reversed data -reversed_data <- list() -for (key in names(named_res)) { - value <- named_res[[key]] - if (!is.null(reversed_data[[value]])) { - reversed_data[[value]] <- c(reversed_data[[value]], key) - } else { - reversed_data[[value]] <- key - } -} - -# Create haplotype table -haplotable <- data.frame( - haplotype.name = unlist(reversed_data), - haplotype.group = rep(names(reversed_data), lengths(reversed_data)) -) -rownames(haplotable) <- NULL - -# Write haplotype table -tsv_output <- gsub(".json", ".tsv", output_file) -fwrite(haplotable, tsv_output, row.names = FALSE, col.names = TRUE, sep = "\t") diff --git a/cosigt_smk/workflow/scripts/cluster_old.r b/cosigt_smk/workflow/scripts/cluster_old.r new file mode 100644 index 0000000..4b929d5 --- /dev/null +++ b/cosigt_smk/workflow/scripts/cluster_old.r @@ -0,0 +1,145 @@ +#!/usr/bin/env Rscript + +# Load required libraries +library(data.table) +library(reshape2) +library(NbClust) +library(rjson) +library(dendextend) +library(randomcoloR) + +# Set data.table threads +setDTthreads(1) + +# Parse command-line arguments +args <- commandArgs(trailingOnly = TRUE) +input_file <- args[1] +output_file <- args[2] + +# Read and process input data +df <- fread(input_file) +df$jaccard.distance <- 1 - df$jaccard.similarity + +#find outliers +z.scores.a <- (df$group.a.length - mean(df$group.a.length)) / sd(df$group.a.length) +z.scores.b <- (df$group.b.length - mean(df$group.b.length)) / sd(df$group.b.length) +outliers.a<-df[abs(z.scores.a) > 3] +outliers.b<-df[abs(z.scores.b) > 3] +outliers<-unique(c(which(abs(z.scores.a) > 3),which(abs(z.scores.b) > 3))) +if (length(outliers)==0) { + excluded<-c() +} else { + newdf<-df[-outliers] + excluded<-unique(df$group.a[df$group.a%in%newdf$group.a == FALSE]) + df<-newdf +} + +# Create distance matrix +regularMatrix <- acast(df, group.a ~ group.b, value.var = "jaccard.distance") +#max distance if NA +regularMatrix[is.na(regularMatrix)]<-1 +distanceMatrix <- as.dist(regularMatrix) + +# Calculate silhouette score and best partition +max_cluster <- round(length(unique(df$group.a)) / 5) ##control +res <- NbClust(diss = distanceMatrix, method = "average", index = "silhouette", + distance = NULL, max.nc = max_cluster)$Best.partition + +# Format results +res.list <- lapply(split(res, names(res)), unname) +named_res <- lapply(res.list, function(x, prefix) paste0(prefix, x), prefix = "HaploGroup") +for (e in excluded) {named_res[[e]] <- "*"} +jout <- toJSON(named_res) + +# Write JSON output +write(jout, output_file) + +# Create reversed data +reversed_data <- list() +for (key in names(named_res)) { + value <- named_res[[key]] + if (!is.null(reversed_data[[value]])) { + reversed_data[[value]] <- c(reversed_data[[value]], key) + } else { + reversed_data[[value]] <- key + } +} + +# Create haplotype table +haplotable <- data.frame( + haplotype.name = unlist(reversed_data), + haplotype.group = rep(names(reversed_data), lengths(reversed_data)) +) +rownames(haplotable) <- NULL + +# Write haplotype table +tsv_output <- gsub(".json", ".tsv", output_file) +fwrite(haplotable, tsv_output, row.names = FALSE, col.names = TRUE, sep = "\t") + +# Simplify matrix names for hclust +simplify_names <- function(x) unlist(strsplit(x, ":"))[1] +colnames(regularMatrix) <- sapply(colnames(regularMatrix), simplify_names) +rownames(regularMatrix) <- sapply(rownames(regularMatrix), simplify_names) + +# Perform hierarchical clustering +k <- as.numeric(tail(sort(res), 1)) +hc <- hclust(as.dist(regularMatrix), method = "average") + +#calculate average distances between clusters +clusters <- cutree(hc, k = k) + +cluster_dist <- matrix(0, nrow = k, ncol = k) +rownames(cluster_dist) <- paste0("HaploGroup", 1:k) +colnames(cluster_dist) <- paste0("HaploGroup", 1:k) +# Calculate mean distances between clusters +for(i in 1:(k-1)) { + for(j in (i+1):k) { + # Get indices for each cluster + cluster_i <- which(clusters == i) + cluster_j <- which(clusters == j) + # Calculate mean distance between clusters + distances <- regularMatrix[cluster_i, cluster_j, drop = FALSE] + mean_dist <- mean(distances) + + # Store distances symmetrically + cluster_dist[i,j] <- mean_dist + cluster_dist[j,i] <- mean_dist + } +} +distance_output <- gsub(".json", ".hapdist.tsv", output_file) +fwrite(data.frame(h.group=row.names(cluster_dist),cluster_dist), distance_output, row.names = FALSE, col.names = TRUE, sep = "\t") + +#plot +palette <- distinctColorPalette(k) + +dend <- as.dendrogram(hc) %>% + set("branches_k_color", value=palette, k = k) %>% + set("labels_col", value=palette,k = k) %>% + set("labels_cex", 0.6) + +#map leaf colors to haplogroups +leaf_colors <- get_leaves_branches_col(dend) +leaf_names <- get_leaves_attr(dend, "label") +cluster_map <- data.frame( + leaf = as.character(leaf_names), + cluster = clusters[leaf_names], + color = leaf_colors +) +cluster_colors <- sapply(unique(clusters), function(cluster) { + leaf_index <- which(clusters == cluster)[1] + leaf_colors[leaf_index] +}) +cluster_info <- data.frame( + cluster = paste0("HaploGroup", unique(clusters)), + color = cluster_colors +) + +unique_clusters <- cluster_map[!duplicated(cluster_map$cluster), ] +#actual plot +pdf(gsub(".json", ".pdf", output_file), width = 20, height = 15) +dend %>% plot(horiz = TRUE) +legend("topleft", + legend = paste0("HaploGroup", unique_clusters$cluster), + fill = unique_clusters$color, + cex = 0.6) +dev.off() \ No newline at end of file diff --git a/cosigt_smk/workflow/scripts/filter.r b/cosigt_smk/workflow/scripts/filter.r index db10e92..2eb64b9 100644 --- a/cosigt_smk/workflow/scripts/filter.r +++ b/cosigt_smk/workflow/scripts/filter.r @@ -9,6 +9,8 @@ cov<-args[1] len<-args[2] flt<-args[3] mask<-args[4] +shared<-0 +total<-0 #process dfcov<-fread(cov, header=T) @@ -17,11 +19,14 @@ dflen<-fread(len, header=F) keep<-data.frame(V1=dflen$V1, V2 = 1) m<-unlist(strsplit(flt, ":")) for (i in 2:ncol(dfcov)) { - print(i) id<-paste0("node.", i-1) + dif<-diff(dfcov[[id]]) + if (all(dif == 0)) { + shared<-shared+dflen$V2[i-1] + } + total<-total+dflen$V2[i-1] if ((length(m) == 2 && m[1] == "common_filter") || (length(m) == 1 && m[1] == "common_filter")) { - if (all(diff(dfcov[[id]]) == 0)) { - print(dfcov[[id]]) + if (all(dif == 0)) { keep$V2[i-1]<-0 } } @@ -35,4 +40,7 @@ for (i in 2:ncol(dfcov)) { #store mask keep$V1 <- NULL -fwrite(keep, file = mask, sep = "\t", row.names = FALSE, col.names = FALSE) \ No newline at end of file +fwrite(keep, file = mask, sep = "\t", row.names = FALSE, col.names = FALSE) +#store pct shared +shared<-data.frame("len_shared" = shared, "len_total" = total, "pct_shared" = shared/total) +fwrite(shared, file = gsub(".mask.tsv", ".shared.tsv", mask), sep = "\t", row.names = FALSE, col.names = TRUE) diff --git a/cosigt_smk/workflow/scripts/outliers.r b/cosigt_smk/workflow/scripts/outliers.r new file mode 100644 index 0000000..41feabb --- /dev/null +++ b/cosigt_smk/workflow/scripts/outliers.r @@ -0,0 +1,42 @@ +#!/usr/bin/env Rscript + +library(data.table) +library(reshape2) +library(dbscan) + +#set data.table threads +setDTthreads(1) + +#parse input +args <- commandArgs(trailingOnly = TRUE) +bed_in<-args[1] +bed_out<-args[2] +#read +df<-fread(bed_in) +#process +difflist<-list() +for (i in 1:nrow(df)) { + h1<-df$V1[i] + h1len<-df$V3[i]-df$V2[i] + for (j in 1:nrow(df)) { + h2<-df$V1[j] + h2len<-df$V3[j]-df$V2[j] + pct<-1-min(h1len,h2len)/max(h1len,h2len) + ddf<-data.frame(h1=h1, h2=h2, diff=pct) + difflist[[paste0(h1,h2)]]<-ddf + } +} +diffdf<-do.call(rbind,difflist) +rownames(diffdf)<-NULL +regularMatrix <- acast(diffdf, h1~h2, value.var = "diff") +distanceMatrix <- as.dist(regularMatrix) +res <- dbscan(distanceMatrix, eps=0.2, minPts = 10)$cluster # +#noise points are 0s +outliers<-labels(distanceMatrix)[which(res == 0)] +#remove from df +if (length(outliers) != 0) { + df<-df[-which(df$V1 %in% outliers)] +} +#write out +fwrite(df, bed_out, row.names = FALSE, col.names = FALSE, sep = "\t") +