From c0fe313d6dfaea3ec7283f7720a75599367cfa0f Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Thu, 7 Jan 2016 11:18:53 +0000 Subject: [PATCH 1/8] do not require a val input for config options --- annot.nf | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/annot.nf b/annot.nf index 03cdd38..92fc57a 100644 --- a/annot.nf +++ b/annot.nf @@ -78,9 +78,6 @@ if (params.do_contiguation) { file sanitized_genome_file file ref_chr file ref_dir - val params.ref_species - val params.GENOME_PREFIX - val params.ABACAS_BIN_CHR output: file 'pseudo.pseudochr.fasta' into pseudochr_seq @@ -589,7 +586,6 @@ process integrate_genemodels { input: file 'merged.gff3' from merged_gff3 file 'sequence.fasta' from pseudochr_seq_integrate - val params.WEIGHT_FILE output: file 'integrated.gff3' into integrated_gff3 @@ -787,7 +783,6 @@ process split_splice_models_at_gaps { process add_polypeptides { input: file 'input.gff3' from genemodels_with_gaps_split_gff3 - val params.CHR_PATTERN output: file 'output.gff3' into genemodels_with_polypeptides_gff3 @@ -854,7 +849,6 @@ proteins_target.into{ proteins_orthomcl process make_ref_input_for_orthomcl { input: file omcl_pepfile - val params.ref_species output: file 'out.gg' into gg_file @@ -873,7 +867,6 @@ process make_ref_input_for_orthomcl { process make_target_input_for_orthomcl { input: file 'pepfile.fas' from proteins_orthomcl - val params.GENOME_PREFIX output: file 'out.gg' into gg_file_ref @@ -1135,9 +1128,7 @@ if (params.use_reference) { input: file 'in.protein.fasta' from refcomp_protein_in set file('pseudo.in.annotation.gff3'), file('scafs.in.annotation.gff3') from refcomp_gff3_in - val params.GENOME_PREFIX file ref_dir - val params.ref_species output: file 'tree_selection.fasta' into tree_fasta @@ -1245,9 +1236,6 @@ if (params.do_contiguation && params.do_circos) { file 'refannot.gff3' from ref_annot file 'blast.in' from circos_blastout file ref_dir - val params.ref_species - val params.CHR_PATTERN - val params.ABACAS_BIN_CHR output: file 'links.txt' into circos_input_links @@ -1405,7 +1393,6 @@ process make_report { input: set file('pseudo.fasta.gz'), file('scaf.fasta.gz') from report_inseq set file('pseudo.gff3'), file('scaf.gff3') from report_gff3 - val params.SPECK_TEMPLATE val specfile output: From d2fc2519b508ff318f200c16b562dfd0baaa7b15 Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Thu, 7 Jan 2016 11:20:39 +0000 Subject: [PATCH 2/8] make ABACAS match size and similarity parameterizable --- annot.nf | 3 ++- params_default.config | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/annot.nf b/annot.nf index 92fc57a..4d4e559 100644 --- a/annot.nf +++ b/annot.nf @@ -89,7 +89,8 @@ if (params.do_contiguation) { """ abacas2.nonparallel.sh \ - ${ref_chr} ${sanitized_genome_file} 500 85 0 3000 + "${ref_chr}" "${sanitized_genome_file}" "${params.ABACAS_MATCH_SIZE}" \ + "${params.ABACAS_MATCH_SIM}" 0 3000 abacas_combine.lua . pseudo "${ref_dir}" "${params.ref_species}" \ "${params.GENOME_PREFIX}" "${params.ABACAS_BIN_CHR}" \ "${params.GENOME_PREFIX}" diff --git a/params_default.config b/params_default.config index 6f69682..2b49bf8 100644 --- a/params_default.config +++ b/params_default.config @@ -29,6 +29,10 @@ params { MAX_GENE_LENGTH = 20000 ABACAS_BIN_CHR = "LDON_0" + // ABACAS min match length and similarity + ABACAS_MATCH_SIZE = 500 + ABACAS_MATCH_SIM = 85 + // RATT parameters RATT_TRANSFER_TYPE = 'Species' From 7ea286719a8efdbebfad285e6a06a92d7d4acaad Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Wed, 13 Jan 2016 16:17:02 +0000 Subject: [PATCH 3/8] remove dead code --- bin/update_references.lua | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/bin/update_references.lua b/bin/update_references.lua index dfa3cf4..abf0b5f 100755 --- a/bin/update_references.lua +++ b/bin/update_references.lua @@ -22,11 +22,6 @@ require("lib") require("table_save") local json = require ("dkjson") -function file_exists(name) - local f=io.open(name,"r") - if f~=nil then io.close(f) return true else return false end -end - ncrna_visitor = gt.custom_visitor_new() function ncrna_visitor:visit_feature(fn) for fn2 in fn:get_children() do @@ -81,17 +76,6 @@ function ncrna_visitor:visit_feature(fn) return 0 end -gg_visitor = gt.custom_visitor_new() -function gg_visitor:visit_feature(fn) - for fn2 in fn:get_children() do - local idv = fn2:get_attribute("ID") - if idv and self.transcript_ids[idv] then - -- pass - end - end - return 0 -end - quotes_visitor = gt.custom_visitor_new() function quotes_visitor:visit_feature(fn) for fn2 in fn:get_children() do @@ -298,6 +282,7 @@ for name, values in pairs(refs.species) do .. values.gff .. " > " .. name .. "/annotation_preclean.gff3") end + -- fix up annotation stat_visitor.stats = {} stat_visitor.stats.nof_genes = 0 stat_visitor.stats.nof_coding_genes = 0 @@ -322,8 +307,6 @@ for name, values in pairs(refs.species) do end return node end - - -- fix up annotations out_stream = gt.gff3_out_stream_new_retainids(fixup_stream, name .. "/annotation.gff3") local gn = out_stream:next_tree() while (gn) do From 5af9df35b4c5bace26b2f26a8eb566f145b6ecca Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Wed, 13 Jan 2016 16:17:20 +0000 Subject: [PATCH 4/8] trim leading/trailing Ns from input seqs --- annot.nf | 2 +- bin/trim_wildcards.lua | 62 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) create mode 100755 bin/trim_wildcards.lua diff --git a/annot.nf b/annot.nf index 4d4e559..1f52b3a 100644 --- a/annot.nf +++ b/annot.nf @@ -62,7 +62,7 @@ process sanitize_input { file 'sanitized.fasta' into sanitized_genome_file """ - sed 's/['\\''+ &]/_/g' ${genome_file} > sanitized.fasta + sed 's/['\\''+ &]/_/g' ${genome_file} | trim_wildcards.lua > sanitized.fasta """ } diff --git a/bin/trim_wildcards.lua b/bin/trim_wildcards.lua new file mode 100755 index 0000000..2520d21 --- /dev/null +++ b/bin/trim_wildcards.lua @@ -0,0 +1,62 @@ +#!/usr/bin/env gt + +--[[ + Author: Sascha Steinbiss + Copyright (c) 2015 Genome Research Ltd + + Permission to use, copy, modify, and distribute this software for any + purpose with or without fee is hereby granted, provided that the above + copyright notice and this permission notice appear in all copies. + + THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +]] + +package.path = gt.script_dir .. "/?.lua;" .. package.path +require("lib") + +function chomp(s) + return string.gsub(s, "\n$", "") +end + +function usage() + io.stderr:write(string.format("Usage: %s\n", arg[0])) + io.stderr:write("Removes leading/trailing Ns from input FASTA sequences.\n") + os.exit(1) +end + +local i, j = 0, 0 +hdr = "" +seq = {} + +function replace(inseq) + inseq = inseq:gsub("^[Nn]+", ""):gsub("[Nn]+$", "") + return inseq +end + +for l in io.lines() do + if string.sub(l,1,1) == '>' then + if i > 0 then + j = j + 1 + i = 0 + print(">"..hdr) + thisseq = replace(table.concat(seq,"")) + print_max_width(thisseq, io.stdout, 60) + end + hdr = string.sub(l, 2) + seq = {} + i = i + 1 + else + table.insert(seq, tostring(chomp(l))) + end +end +if #seq > 0 then + print(">"..hdr) + thisseq = replace(table.concat(seq,"")) + print_max_width(thisseq, io.stdout, 60) +end From b6ac181e2c5edc4692c7612ee0ba6e04f20d539c Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Tue, 12 Jan 2016 17:06:16 +0000 Subject: [PATCH 5/8] address issues from ENA validator --- bin/gff3_to_embl.lua | 39 ++++++++++++++++--------------------- bin/split_genes_at_gaps.lua | 15 ++++++++------ 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/bin/gff3_to_embl.lua b/bin/gff3_to_embl.lua index cb7575f..3fa3c32 100755 --- a/bin/gff3_to_embl.lua +++ b/bin/gff3_to_embl.lua @@ -240,28 +240,17 @@ function embl_vis:visit_feature(fn) format_embl_attrib(node , "ID", "locus_tag", nil) if fn:get_type() == "pseudogene" then io.write("FT /pseudo\n") - format_embl_attrib(pp, "product", "note", - function (s) - local pr_a = gff3_extract_structure(s) - local gprod = pr_a[1].term - if gprod then - return "product: " .. gprod - else - return nil - end - end) - else - format_embl_attrib(pp, "product", "product", - function (s) - local pr_a = gff3_extract_structure(s) - local gprod = pr_a[1].term - if gprod then - return gprod - else - return nil - end - end) end + format_embl_attrib(pp, "product", "product", + function (s) + local pr_a = gff3_extract_structure(s) + local gprod = pr_a[1].term + if gprod then + return gprod + else + return nil + end + end) format_embl_attrib(pp, "Dbxref", "EC_number", function (s) m = string.match(s, "EC:([0-9.-]+)") @@ -281,7 +270,10 @@ function embl_vis:visit_feature(fn) if node:get_type() == "mRNA" then protseq = node:extract_and_translate_sequence("CDS", true, region_mapping) - io.write("FT /translation=\"" .. protseq:sub(1,-2) .."\"\n") + if protseq:sub(-1,-1) == "*" then + protseq = protseq:sub(1,-2) + end + io.write("FT /translation=\"" .. protseq .."\"\n") end io.write("FT /transl_table=1\n") -- orthologs @@ -348,6 +340,7 @@ function embl_vis:visit_feature(fn) io.write(" (" .. node:get_attribute("anticodon") .. ")") end io.write("\"\n") + io.write("FT /gene=\"" .. fn:get_attribute("ID") .. "\"\n") end format_embl_attrib(node , "ID", "locus_tag", nil) elseif string.match(node:get_type(), "snRNA") or string.match(node:get_type(), "snoRNA") then @@ -361,6 +354,7 @@ function embl_vis:visit_feature(fn) end io.write("\n") io.write("FT /ncRNA_class=\"" .. node:get_type() .. "\"\n") + io.write("FT /gene=\"" .. fn:get_attribute("ID") .. "\"\n") format_embl_attrib(node , "ID", "locus_tag", nil) elseif string.match(node:get_type(), "rRNA") then io.write("FT rRNA ") @@ -373,6 +367,7 @@ function embl_vis:visit_feature(fn) end io.write("\n") io.write("FT /product=\"" .. node:get_type() .. "\"\n") + io.write("FT /gene=\"" .. fn:get_attribute("ID") .. "\"\n") format_embl_attrib(node , "ID", "locus_tag", nil) elseif string.match(node:get_type(), "gap") then io.write("FT gap ") diff --git a/bin/split_genes_at_gaps.lua b/bin/split_genes_at_gaps.lua index df0b915..7a8c0b7 100755 --- a/bin/split_genes_at_gaps.lua +++ b/bin/split_genes_at_gaps.lua @@ -63,16 +63,20 @@ function stream:process_current_cluster() -- count and collect gaps between scaffolds -- these always need to be broken for _,n in ipairs(self.curr_gene_set) do - if n:get_type() == "gap" - and n:get_attribute("gap_type") == "between scaffolds" then - table.insert(gaps, n) + if n:get_type() == "gap" then + table.insert(stream.outqueue, n) + if n:get_attribute("gap_type") == "between scaffolds" then + table.insert(gaps, n) + end end end - -- no gaps in cluster, so just pass along genes + -- no relevant gaps in cluster, so just pass along genes if #gaps == 0 then for _,n in ipairs(self.curr_gene_set) do - table.insert(stream.outqueue, n) + if n:get_type() ~= "gap" then + table.insert(stream.outqueue, n) + end end return 0 end @@ -132,7 +136,6 @@ function stream:process_current_cluster() end if not skip_gene then table.insert(outbuf, cur_gene) - table.insert(outbuf, g) end cur_gene = rest end From 51ba0200fe457f3d7a19c9e17035820caea2b6cd Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Thu, 14 Jan 2016 17:28:32 +0000 Subject: [PATCH 6/8] remove redundant code --- bin/abacas_combine.lua | 4 ++-- bin/lib.lua | 7 ++++++- bin/trim_wildcards.lua | 9 ++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/abacas_combine.lua b/bin/abacas_combine.lua index e9f13ed..ba2d6e5 100755 --- a/bin/abacas_combine.lua +++ b/bin/abacas_combine.lua @@ -198,7 +198,7 @@ table.sort(newkeys) -- for all toplevel seqs... for _,seqid in ipairs(newkeys) do pseudochr_fasta_out:write(">" .. seqid .. "\n") - print_max_width(pseudochr_seq[seqid], pseudochr_fasta_out, 60) + print_max_width(trim_ns(pseudochr_seq[seqid]), pseudochr_fasta_out, 60) print(seqid .. ": " .. #scafs[seqid]) local i = 1 local s_last_stop = 0 @@ -252,4 +252,4 @@ end for k,v in pairs(ref_target_chromosome_map) do ref_target_mapping_out:write(k .. "\t" .. v[1] .. "\t" .. v[2] .. "\n") -end \ No newline at end of file +end diff --git a/bin/lib.lua b/bin/lib.lua index 8e462af..26030cd 100644 --- a/bin/lib.lua +++ b/bin/lib.lua @@ -278,6 +278,11 @@ function deep_copy(root, copy, table) return copy end +function trim_ns(inseq) + inseq = inseq:gsub("^[Nn]+", ""):gsub("[Nn]+$", "") + return inseq +end + function print_max_width(str, ioo, width) local i = 1 while (i + width - 1) < str:len() do @@ -368,4 +373,4 @@ function print_r ( t ) sub_print_r(t," ") end print() -end \ No newline at end of file +end diff --git a/bin/trim_wildcards.lua b/bin/trim_wildcards.lua index 2520d21..51ea45b 100755 --- a/bin/trim_wildcards.lua +++ b/bin/trim_wildcards.lua @@ -34,18 +34,13 @@ local i, j = 0, 0 hdr = "" seq = {} -function replace(inseq) - inseq = inseq:gsub("^[Nn]+", ""):gsub("[Nn]+$", "") - return inseq -end - for l in io.lines() do if string.sub(l,1,1) == '>' then if i > 0 then j = j + 1 i = 0 print(">"..hdr) - thisseq = replace(table.concat(seq,"")) + thisseq = trim_ns(table.concat(seq,"")) print_max_width(thisseq, io.stdout, 60) end hdr = string.sub(l, 2) @@ -57,6 +52,6 @@ for l in io.lines() do end if #seq > 0 then print(">"..hdr) - thisseq = replace(table.concat(seq,"")) + thisseq = trim_ns(table.concat(seq,"")) print_max_width(thisseq, io.stdout, 60) end From 08f9e28a03c948b6bd72007edd71246e822184be Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Fri, 15 Jan 2016 16:57:37 +0000 Subject: [PATCH 7/8] address quirks in ENA validator/converter --- bin/gff3_to_embl.lua | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/bin/gff3_to_embl.lua b/bin/gff3_to_embl.lua index 3fa3c32..98ab005 100755 --- a/bin/gff3_to_embl.lua +++ b/bin/gff3_to_embl.lua @@ -202,6 +202,9 @@ function embl_vis:visit_feature(fn) cnt = cnt + 1 end end + if cnt == 0 then + break + end io.write("FT CDS ") if node:get_strand() == "-" then io.write("complement(") @@ -211,14 +214,26 @@ function embl_vis:visit_feature(fn) end local i = 1 local coding_length = 0 + local start_phase = 0 + local end_phase = 0 for cds in node:get_children() do if cds:get_type() == "CDS" or cds:get_type() == "pseudogenic_exon" then if i == 1 and fn:get_attribute("Start_range") then + if fn:get_strand() == '-' then + end_phase = tonumber(cds:get_phase()) + else + start_phase = tonumber(cds:get_phase()) + end io.write("<") end io.write(cds:get_range():get_start()) io.write("..") if i == cnt and fn:get_attribute("End_range") then + if fn:get_strand() == '-' then + start_phase = tonumber(cds:get_phase()) + else + end_phase = tonumber(cds:get_phase()) + end io.write(">") end io.write(cds:get_range():get_end()) @@ -268,12 +283,31 @@ function embl_vis:visit_feature(fn) -- translation local protseq = nil if node:get_type() == "mRNA" then + cdnaseq = node:extract_sequence("CDS", true, region_mapping) protseq = node:extract_and_translate_sequence("CDS", true, region_mapping) + if embl_compliant and cdnaseq:len() % 3 > 0 then + -- The ENA expects translations for DNA sequences with a length + -- which is not a multiple of three in a different way from the one + -- produced by GenomeTools. While GenomeTools cuts off the + -- remainder and does not translate the partial codon, the ENA + -- validator fills it up with Ns and translates it. This behaviour + -- is emulated here to make the \translation value validate + -- properly. + cdnaseq = cdnaseq .. string.rep('n', 3 - (cdnaseq:len() % 3)) + if start_phase > 0 then + cdnaseq = cdnaseq:sub(start_phase + 1) + end +-- print(">"..geneid .."\n"..protseq) + protseq = gt.translate_dna(cdnaseq) + end + + -- clip off stop codon if protseq:sub(-1,-1) == "*" then protseq = protseq:sub(1,-2) end io.write("FT /translation=\"" .. protseq .."\"\n") + io.write("FT /codon_start=" .. start_phase + 1 .. "\n") end io.write("FT /transl_table=1\n") -- orthologs From 8bc939627230d9931fc4296ac2d9abece366e6fa Mon Sep 17 00:00:00 2001 From: Sascha Steinbiss Date: Wed, 20 Jan 2016 11:00:28 +0000 Subject: [PATCH 8/8] output 'AC *' line in EMBL --- bin/gff3_to_embl.lua | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/gff3_to_embl.lua b/bin/gff3_to_embl.lua index 98ab005..28a52ee 100755 --- a/bin/gff3_to_embl.lua +++ b/bin/gff3_to_embl.lua @@ -166,6 +166,8 @@ function embl_vis:visit_feature(fn) io.write("XX \n") io.write("AC XXX;\n") io.write("XX \n") + io.write("AC * _" .. fn:get_seqid() .. ";\n") + io.write("XX \n") else io.write("ID " .. fn:get_seqid() .. "; SV 1; linear; " .. "genomic DNA; STD; UNC; "