Skip to content
This repository has been archived by the owner on Nov 7, 2021. It is now read-only.

ENA validation and ID assignment #46

Merged
merged 4 commits into from
Jan 28, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions annot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -788,10 +788,18 @@ process add_polypeptides {
output:
file 'output.gff3' into genemodels_with_polypeptides_gff3

"""
create_polypeptides.lua input.gff3 ${params.GENOME_PREFIX} \
"${params.CHR_PATTERN}" | gt gff3 -sort -retainids -tidy > output.gff3
"""
script:
if (params.alphanumeric_ids)
"""
create_polypeptides.lua -r input.gff3 ${params.GENOME_PREFIX} \
"${params.CHR_PATTERN}" | gt gff3 -sort -retainids -tidy > output.gff3
"""
else
"""
create_polypeptides.lua input.gff3 ${params.GENOME_PREFIX} \
"${params.CHR_PATTERN}" | gt gff3 -sort -retainids -tidy > output.gff3
"""

}

// TMHMM
Expand Down
150 changes: 123 additions & 27 deletions bin/create_polypeptides.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

--[[
Author: Sascha Steinbiss <ss34@sanger.ac.uk>
Copyright (c) 2014 Genome Research Ltd
Copyright (c) 2014-2016 Genome Research Ltd

Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
Expand All @@ -17,24 +17,137 @@
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
]]

package.path = gt.script_dir .. "/?.lua;" .. package.path
require("lib")
require("optparse")

default_length = 4
default_increment = 100
default_startval = 5000
default_seed = os.time()

op = OptionParser:new({usage="%prog [options] <GFF with CDS annotations> <prefix> <chromosome pattern>",
oneliner="Adds polypeptide features and systematic IDs to raw gene features.",
version="0.1"})
op:option{"-r", action='store_true', dest='random_alphanumeric',
help="use random alphanumeric locus_tags/geneIDs"}
op:option{"-l", action='store', dest='random_length', default=default_length,
help="length of random alphanumeric IDs"}
op:option{"-i", action='store', dest='increment', default=default_increment,
help="increment between consecutive gene IDs"}
op:option{"-s", action='store', dest='startval', default=default_startval,
help="start value per chromosome for gene IDs"}
op:option{"-e", action='store', dest='seed', default=default_seed,
help="seed value for PRNG"}
options,args = op:parse({random_alphanumeric = false,
random_length = tonumber(default_length),
increment = tonumber(default_increment),
startval = tonumber(default_startval),
seed = tonumber(default_seed)})

function usage()
io.stderr:write(string.format("Usage: %s <GFF with CDS annotations> <prefix> <chromosome pattern>\n" , arg[0]))
op:help()
os.exit(1)
end

if #arg < 3 then
if #args ~= 3 then
usage()
os.exit(1)
end

package.path = gt.script_dir .. "/?.lua;" .. package.path
require("lib")
-- implementation for numeric gene IDs, with chromosome numbers
IDProviderNumeric = {}
IDProviderNumeric.__index = IDProviderNumeric
function IDProviderNumeric.new(startval, increment, prefix, chr_pattern)
idp = {}
setmetatable(idp, IDProviderNumeric)
idp.startval = startval
idp.increment = increment
idp.prefix = prefix
idp.chr_pattern = chr_pattern
idp.numbers = {}
return idp
end
function IDProviderNumeric:next_for_feature(feature)
local seqid = feature:get_seqid()
local m = seqid:match(self.chr_pattern)
local chr = '00'
if m then
if tonumber(m) then
chr = string.format("%02d", tonumber(m))
else
chr = m
end
end
assert(chr)
if not self.numbers[chr] then
self.numbers[chr] = startval
end
local rval = string.format("%s_%s%07d", self.prefix, chr, self.numbers[chr])
self.numbers[chr] = self.numbers[chr] + increment
return rval
end

increment = 100
-- implementation for random alphanumeric IDs
IDProviderRandomAlpha = {}
IDProviderRandomAlpha.__index = IDProviderRandomAlpha
function IDProviderRandomAlpha.new(prefix, length, seed)
idp = {}
setmetatable(idp, IDProviderRandomAlpha)
idp.length = length
idp.alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
idp.maxcount = math.pow(idp.alpha:len(), length)
idp.ids = {}
idp.count = 0
idp.prefix = prefix
math.randomseed(seed)
return idp
end
function IDProviderRandomAlpha:next_for_feature(feature)
local id = ""
if self.count + 1 > self.maxcount then
error("can't handle more than " .. self.maxcount
.. " distinct IDs with length " .. self.length)
end
-- This is just sample+test, but should still be fast enough for typical
-- feature numbers as |idp.alpha|^4 (default) is 1679616 already. If we get
-- annotations with more features, we have more serious problems anyway.
while id == "" or self.ids[id] do
id = ""
for i = 1, self.length do
local pos = math.random(1, self.alpha:len())
id = id .. self.alpha:sub(pos, pos)
end
end
local rval = string.format("%s_%s", self.prefix, id)
self.ids[id] = true
self.count = self.count + 1
return rval
end

increment = tonumber(options.increment)
if increment < 1 then
error("increment value must be >0")
end
startval = tonumber(options.startval)
if startval < 0 then
error("startval value must be >0")
end
if options.random_length and options.random_alphanumeric then
random_length = tonumber(options.random_length)
if random_length < 4 then
error("startval value must be >3")
end
end

cv = gt.custom_visitor_new()
if options.random_alphanumeric then
cv.id_provider = IDProviderRandomAlpha.new(args[2],
options.random_length,
options.seed)
else
cv.id_provider = IDProviderNumeric.new(startval, increment, args[2], args[3])
end
cv.queue = {}
cv.numbers = {}
cv.suffixes = {}
cv.suffixes.gene = ''
function cv:visit_feature(fn)
Expand All @@ -43,23 +156,7 @@ function cv:visit_feature(fn)
if fn:get_type() == "gap" or fn:get_type() == "contig" then
return 0
end
local seqid = fn:get_seqid()
-- default for 'bin' seqs
local chr = "00"
m = seqid:match(arg[3])
if m then
if tonumber(m) then
chr = string.format("%02d", tonumber(m))
else
chr = m
end
end
-- initialize counter for feature numbers
if not self.numbers[chr] then
self.numbers[chr] = 5000
end
-- determine chromosome number
local base_id = string.format("%s_%s%07d", arg[2], chr, self.numbers[chr])
local base_id = self.id_provider:next_for_feature(fn)
for n in fn:get_children() do
local this_id = base_id
--delete "Name"s
Expand Down Expand Up @@ -138,13 +235,12 @@ function cv:visit_feature(fn)
c:remove_attribute("_done")
end
end
self.numbers[chr] = self.numbers[chr] + increment
return 0
end

vis_stream = gt.custom_stream_new_unsorted()
vis_stream.queue = cv.queue
vis_stream.instream = gt.gff3_in_stream_new_sorted(arg[1])
vis_stream.instream = gt.gff3_in_stream_new_sorted(args[1])
vis_stream.visitor = cv
function vis_stream:next_tree()
if table.getn(self.queue) > 0 then
Expand Down
12 changes: 11 additions & 1 deletion bin/gff3_to_embl.lua
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

--[[
Author: Sascha Steinbiss <ss34@sanger.ac.uk>
Copyright (c) 2014-2015 Genome Research Ltd
Copyright (c) 2014-2016 Genome Research Ltd

Permission to use, copy, modify, and distribute this software for any
purpose with or without fee is hereby granted, provided that the above
Expand Down Expand Up @@ -288,6 +288,16 @@ function embl_vis:visit_feature(fn)
cdnaseq = node:extract_sequence("CDS", true, region_mapping)
protseq = node:extract_and_translate_sequence("CDS", true,
region_mapping)
local startcodon = cdnaseq:sub(start_phase + 1, start_phase + 3):lower()
if embl_compliant and startcodon:match('^[tc]tg$') then
-- The ENA expects non-M start codons (e.g. Ls) to be represented as
-- Ms in the literal translation (according to their validator
-- output). Emulating this behaviour here, diverging from the actual
-- translation table.
if protseq:sub(1,1):upper() == 'L' then
protseq = 'M' .. protseq:sub(2)
end
end
if embl_compliant and cdnaseq:len() % 3 > 0 then
-- The ENA expects translations for DNA sequences with a length
-- which is not a multiple of three in a different way from the one
Expand Down
Loading