Skip to content

Commit

Permalink
refactoring of dbscan clustering, mod odgi untangle command, output c…
Browse files Browse the repository at this point in the history
…luster distances (norm and not), fix odgi mask
  • Loading branch information
davidebolo1993 committed Jan 29, 2025
1 parent 7ef2ac8 commit c48a7ad
Show file tree
Hide file tree
Showing 13 changed files with 326 additions and 218 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file added cosigt_smk/rulegraph.pdf
Binary file not shown.
8 changes: 4 additions & 4 deletions cosigt_smk/workflow/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,15 +27,15 @@ cosigt_input=list()
for region in config['region']:
for sample in df['sample_id'].tolist():
cosigt_input.append(config['output'] + '/cosigt/'+ sample + '/' + region + '/cosigt_genotype.tsv')
cosigt_input.append(config['output'] + '/cosigt_dbscan/'+ sample + '/' + region + '/cosigt_genotype.tsv')
cosigt_input.append(config['output'] + '/bwa-mem2/'+ sample + '/' + region + '.realigned.bam.all.pdf')
#cosigt_input.append(config['output'] + '/cosigt_old/'+ sample + '/' + region + '/cosigt_genotype.tsv')
#cosigt_input.append(config['output'] + '/bwa-mem2/'+ sample + '/' + region + '.realigned.bam.all.pdf') #skip coverage calculation and plotting
cosigt_input.append(config['output'] + '/odgi/viz/' + region + '.png')
cosigt_input.append(config['output'] + '/odgi/dissimilarity/' + region + '.tsv')
cosigt_input.append(config['output'] + '/odgi/view/' + region + '.node.length.tsv')
cosigt_input.append(config['output'] + '/pgrtk/bundles/' + region + '.bstruct.dist.tsv')
if config['annotations'] != '':
cosigt_input.append(config['output'] + '/odgi/untangle/' + region + '.gggenes.pdf')
cosigt_input.append(config['output'] + '/odgi/untangle_dbscan/' + region + '.gggenes.pdf')
#cosigt_input.append(config['output'] + '/odgi/untangle_old/' + region + '.gggenes.pdf')

rule cosigt:
input:
Expand All @@ -46,4 +46,4 @@ rule benchmark:
input:
config['output'] + '/benchmark/resources.time.pdf',
config['output'] + '/benchmark/resources.mem.pdf',
config['output'] + '/benchmark/tpr.pdf'
config['output'] + '/benchmark/tpr.pdf'
30 changes: 28 additions & 2 deletions cosigt_smk/workflow/rules/bedtools.smk
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,33 @@ rule bedtools_merge:
bedtools sort \
-i {input} | \
bedtools merge \
-d 100000 \
-d 200000 \
-i - > {output} \
&& cat {params.ref_region} >> {output}
'''
'''

rule filter_outliers:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
rules.bedtools_merge.output
output:
config['output'] + '/bedtools/filtered/{region}.bed'
threads:
1
resources:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://davidebolo1993/renv:4.3.3'
conda:
'../envs/r.yaml'
benchmark:
'benchmarks/{region}.filter_outliers.benchmark.txt'
shell:
'''
Rscript workflow/scripts/outliers.r \
{input} \
{output}
'''
20 changes: 10 additions & 10 deletions cosigt_smk/workflow/rules/cosigt.smk
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
rule cosigt_genotype:
rule cosigt_genotype_old:
'''
https://github.com/davidebolo1993/cosigt
'''
Expand All @@ -7,8 +7,8 @@ rule cosigt_genotype:
sample_cov_map=rules.gafpack_coverage.output,
json=rules.make_clusters.output
output:
config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv',
config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv'
config['output'] + '/cosigt_old/{sample}/{region}/cosigt_genotype.tsv',
config['output'] + '/cosigt_old/{sample}/{region}/sorted_combos.tsv'
threads:
1
resources:
Expand All @@ -19,10 +19,10 @@ rule cosigt_genotype:
conda:
'../envs/cosigt.yaml'
params:
prefix=config['output'] + '/cosigt/{sample}/{region}',
prefix=config['output'] + '/cosigt_old/{sample}/{region}',
sample_id='{sample}'
benchmark:
'benchmarks/{sample}.{region}.cosigt_genotype.benchmark.txt'
'benchmarks/{sample}.{region}.cosigt_genotype_old.benchmark.txt'
shell:
'''
cosigt \
Expand All @@ -33,17 +33,17 @@ rule cosigt_genotype:
-i {params.sample_id}
'''

rule cosigt_genotype_dbscan:
rule cosigt_genotype:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
graph_cov_map=rules.odgi_paths_matrix.output,
sample_cov_map=rules.gafpack_coverage.output,
json=rules.make_dbscan_clusters.output
json=rules.make_clusters.output
output:
config['output'] + '/cosigt_dbscan/{sample}/{region}/cosigt_genotype.tsv',
config['output'] + '/cosigt_dbscan/{sample}/{region}/sorted_combos.tsv'
config['output'] + '/cosigt/{sample}/{region}/cosigt_genotype.tsv',
config['output'] + '/cosigt/{sample}/{region}/sorted_combos.tsv'
threads:
1
resources:
Expand All @@ -57,7 +57,7 @@ rule cosigt_genotype_dbscan:
prefix=config['output'] + '/cosigt/{sample}/{region}',
sample_id='{sample}'
benchmark:
'benchmarks/{sample}.{region}.cosigt_genotype_dbscan.benchmark.txt'
'benchmarks/{sample}.{region}.cosigt_genotype.benchmark.txt'
shell:
'''
cosigt \
Expand Down
57 changes: 31 additions & 26 deletions cosigt_smk/workflow/rules/odgi.smk
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ rule odgi_chop:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand All @@ -40,7 +40,7 @@ rule odgi_view:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand All @@ -66,7 +66,7 @@ rule odgi_paths_matrix:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand Down Expand Up @@ -144,7 +144,7 @@ rule odgi_similarity:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand All @@ -171,7 +171,7 @@ rule odgi_dissimilarity:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand All @@ -184,14 +184,14 @@ rule odgi_dissimilarity:
--distances > {output}
'''

rule make_clusters:
rule make_clusters_old:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
rules.odgi_similarity.output
output:
config['output'] + '/cluster/{region}.clusters.json'
config['output'] + '/cluster_old/{region}.clusters.json'
threads:
1
resources:
Expand All @@ -202,22 +202,23 @@ rule make_clusters:
conda:
'../envs/r.yaml'
benchmark:
'benchmarks/{region}.make_clusters.benchmark.txt'
'benchmarks/{region}.make_clusters_old.benchmark.txt'
shell:
'''
Rscript workflow/scripts/cluster.r \
{input} \
{output}
'''

rule make_dbscan_clusters:
rule make_clusters:
'''
https://github.com/davidebolo1993/cosigt
'''
input:
rules.odgi_dissimilarity.output
matrix=rules.odgi_dissimilarity.output,
filter=rules.filter_nodes.output
output:
config['output'] + '/cluster_dbscan/{region}.clusters.json'
config['output'] + '/cluster/{region}.clusters.json'
threads:
1
resources:
Expand All @@ -229,10 +230,12 @@ rule make_dbscan_clusters:
'../envs/r.yaml'
benchmark:
'benchmarks/{region}.make_clusters.benchmark.txt'
params:
threshold_file=config['output'] + '/odgi/paths/matrix/{region}.shared.tsv'
shell:
'''
Rscript workflow/scripts/cluster_dbscan.r \
{input} \
Rscript workflow/scripts/cluster.r \
{input.matrix} \
{output} \
0.95
'''
Expand All @@ -253,7 +256,7 @@ rule odgi_viz:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand Down Expand Up @@ -341,7 +344,7 @@ rule odgi_procbed:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand Down Expand Up @@ -408,7 +411,7 @@ rule odgi_inject:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand Down Expand Up @@ -436,7 +439,7 @@ rule odgi_flip:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand Down Expand Up @@ -466,7 +469,7 @@ rule odgi_untangle:
mem_mb=lambda wildcards, attempt: attempt * config['default']['mem_mb'],
time=lambda wildcards, attempt: attempt * config['default']['time']
container:
'docker://pangenome/odgi:1736526388'
'docker://pangenome/odgi:1738101065'
conda:
'../envs/odgi.yaml'
benchmark:
Expand All @@ -477,18 +480,20 @@ rule odgi_untangle:
-R {input.txt} \
-i {input.og} \
-j 0.3 \
-g > {output}
-g \
-m 1000 \
-e 1000 > {output}
'''

rule plot_gggenes:
rule plot_gggenes_old:
'''
https://github.com/pangenome/odgi
'''
input:
tsv=rules.odgi_untangle.output,
json=rules.make_clusters.output
json=rules.make_clusters_old.output
output:
config['output'] + '/odgi/untangle/{region}.gggenes.pdf'
config['output'] + '/odgi/untangle_old/{region}.gggenes.pdf'
threads:
1
resources:
Expand All @@ -499,7 +504,7 @@ rule plot_gggenes:
conda:
'../envs/r.yaml'
benchmark:
'benchmarks/{region}.plot_gggenes.benchmark.txt'
'benchmarks/{region}.plot_gggenes_old.benchmark.txt'
shell:
'''
Rscript \
Expand All @@ -509,15 +514,15 @@ rule plot_gggenes:
{output}
'''

rule plot_gggenes_dbscan:
rule plot_gggenes:
'''
https://github.com/pangenome/odgi
'''
input:
tsv=rules.odgi_untangle.output,
json=rules.make_dbscan_clusters.output
json=rules.make_clusters.output
output:
config['output'] + '/odgi/untangle_dbscan/{region}.gggenes.pdf'
config['output'] + '/odgi/untangle/{region}.gggenes.pdf'
threads:
1
resources:
Expand Down
4 changes: 2 additions & 2 deletions cosigt_smk/workflow/rules/samtools.smk
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ rule samtools_faidx_extract:
'''
input:
fasta=rules.add_target_to_queries.output,
bed=rules.bedtools_merge.output
bed=rules.filter_outliers.output
output:
config['output'] + '/samtools/faidx/{region}.fasta'
threads:
Expand All @@ -92,7 +92,7 @@ rule samtools_faidx_extract:
shell:
'''
samtools faidx \
-r <(awk '{{print $1":"$2+1"-"$3}}' {input.bed}) \
-r <(awk '{{print $1":"$2"-"$3}}' {input.bed}) \
{input.fasta} > {output}
'''

Expand Down
20 changes: 17 additions & 3 deletions cosigt_smk/workflow/scripts/annotate.r
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,28 @@ gtf<-args[1]
path<-args[2]
bed<-args[3]

#split region
reg<-unlist(strsplit(unlist(strsplit(basename(gtf), ".", fixed=T))[1], "_"))
start<-as.numeric(reg[3])
end<-as.numeric(reg[4])

#process
df<-import(gtf)
genes<-unique(df$gene_name)
count<-0
for (g in genes) {
subdf<-as.data.frame(df[df$gene_name == g])
subdf<-head(subdf[order(subdf$end-subdf$start,decreasing=TRUE),],1)[c("seqnames", "start", "end", "gene_name")]
subdf<-head(subdf[order(subdf$end-subdf$start,decreasing=TRUE),],1)[c("seqnames", "start", "end", "gene_name", "gene_type")]
if (nrow(subdf)>=1) {
subdf$seqnames<-path
fwrite(subdf, file = bed, sep = "\t", row.names = FALSE, col.names = FALSE, append = T)
if (subdf$start >= start && subdf$end <= end) {
count<-count+1
subdf$seqnames<-path
fwrite(subdf, file = bed, sep = "\t", row.names = FALSE, col.names = FALSE, append = T)
}
}
}

if (count == 0) {
subdf<-data.frame(seqnames=path, start=start, end=end, gene_name="unknown", gene_type="unknown")
fwrite(subdf, file = bed, sep = "\t", row.names = FALSE, col.names = FALSE)
}
Loading

0 comments on commit c48a7ad

Please sign in to comment.