forked from Arkarachai/STR-FM
-
Notifications
You must be signed in to change notification settings - Fork 0
/
microsatellite.xml
162 lines (130 loc) · 6.11 KB
/
microsatellite.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
<tool id="microsatellite" name="STR detection" version="1.0.0">
<description>for short read, reference, and mapped data</description>
<!-- Command template (Cheetah) that assembles the microsatellite.py call.
     Argument order: input dataset path, one input-format flag, the optional
     reference options (only reachable for SAM input), the numeric search
     settings, and finally the flank display option. The script's standard
     output is redirected into the "stdout" output dataset.
     Select and boolean values are compared through str() so every test is
     an explicit string comparison instead of relying on wrapper coercion. -->
<command interpreter="python2.7"> microsatellite.py
    "${filePath}"
    #if str($inputFileSource.inputFileType) == "fasta"
        --fasta
    #elif str($inputFileSource.inputFileType) == "fastq"
        --fastq
    #elif str($inputFileSource.inputFileType) == "fastq_noquals"
        --fastq:noquals
    #elif str($inputFileSource.inputFileType) == "sam"
        --sam
        #if str($inputFileSource.referenceFileSource.requireReference) == "true"
            --r --ref="${inputFileSource.referenceFileSource.referencePath}"
        #end if
    #end if
    --period="${period}"
    #if str($partialmotifs) == "true"
        --partialmotifs
    #end if
    --minlength="${minlength}"
    --prefix="${prefix}"
    --suffix="${surfix}"
    --hamming="${hammingThreshold}"
    #if str($multipleruns) == "true"
        --multipleruns
    #end if
    #if str($flankSetting.noflankdisplay) == "true"
        --noflankdisplay
    #else
        --flankdisplay="${flankSetting.flankdisplay}"
    #end if
    > "${stdout}"
</command>
<!-- Form parameters. Names are referenced verbatim by the command template
     and by the tests, so they are kept unchanged (including "surfix"). -->
<inputs>
  <!-- Dataset to scan; its format is declared by the selector below. -->
  <param name="filePath" label="Select input file" type="data"/>
  <conditional name="inputFileSource">
    <param name="inputFileType" type="select" label="Select input file type">
      <option value="fasta">Fasta File</option>
      <option value="fastq">Fastq File</option>
      <option value="fastq_noquals">Fastq File without Quality Information</option>
      <option value="sam">SAM File</option>
    </param>
    <!-- These formats take no extra inputs; the cases are declared explicitly
         so that every selectable value has a matching "when". -->
    <when value="fasta"/>
    <when value="fastq"/>
    <when value="fastq_noquals"/>
    <when value="sam">
      <!-- Only SAM input can request the reference comparison. -->
      <conditional name="referenceFileSource">
        <param name="requireReference" label="Do you want to extract correspond microsatellites in reference for comparison?" type="boolean"/>
        <when value="true">
          <param name="referencePath" label="Select reference file" type="data"/>
        </when>
        <when value="false"/>
      </conditional>
    </when>
  </conditional>
  <param name="period" label="Motif size of microsatellites of interest (e.g. Mononucleotide microsatellite =1) (must be less than 10)" type="integer" size="2" value="1"/>
  <param name="partialmotifs" label="Consider microsatellites with a partial motif?" type="boolean" checked="true"/>
  <param name="minlength" label="Minimal length (bp) of microsatellite sequence reported" type="integer" size="2" value="5"/>
  <param name="prefix" label="Do not report candidate repeat intervals that have left flanking region less than (bp):" type="integer" size="4" value="20"/>
  <!-- Passed to the script as the suffix option, i.e. the right-hand flank. -->
  <param name="surfix" label="Do not report candidate repeat intervals that have right flanking region less than (bp):" type="integer" size="4" value="20"/>
  <param name="hammingThreshold" label="Hamming threshold of microsatellite, If greater than 0, interrupted microsatellites will also be reported" type="integer" size="2" value="0"/>
  <param name="multipleruns" label="Consider all candidate intervals in a sequence. If not check, only the longest one will be considered" type="boolean" checked="true"/>
  <conditional name="flankSetting">
    <param name="noflankdisplay" label="Show the entire flanking regions" type="boolean" checked="true"/>
    <!-- No length limit is asked for when the entire flanks are shown. -->
    <when value="true"/>
    <when value="false">
      <param name="flankdisplay" label="Limit length (bp) of flanking regions shown" type="integer" size="4" value="5"/>
    </when>
  </conditional>
</inputs>
<outputs>
  <!-- Single tabular dataset; the command template redirects the script's
       standard output into it. -->
  <data format="tabular" name="stdout"/>
</outputs>
<tests>
  <!-- Valid values: fastq input, motif size 1, partial motifs allowed,
       multiple runs enabled, flank display left at its default. -->
  <test>
    <param name="filePath" value="C_sample_fastq"/>
    <param name="inputFileType" value="fastq"/>
    <param name="period" value="1"/>
    <param name="partialmotifs" value="true"/>
    <param name="minlength" value="3"/>
    <param name="prefix" value="5"/>
    <param name="surfix" value="5"/>
    <param name="hammingThreshold" value="0"/>
    <param name="multipleruns" value="true"/>
    <!-- The name must match the dataset declared in the outputs section. -->
    <output name="stdout" file="C_sample_snoope"/>
  </test>
</tests>
<help>
.. class:: infomark
**What it does**
This tool identifies simple as well as interrupted STRs. Choosing a hamming distance of zero will return simple STRs.
Choosing a hamming distance of greater than zero will return both simple and interrupted STRs.
The algorithms used to identify simple and interrupted STRs are described in the manuscript cited below (see TABLE XXXX).
**Citation**
When you use this tool, please cite **Fungtammasan A, Ananda G, Hile SE, Su MS, Sun C, Harris R, Medvedev P, Eckert K, Makova KD. 2015. Accurate Typing of Short Tandem Repeats from Genome-wide Sequencing Data and its Applications, Genome Research**
This tool is developed by Chen Sun (cxs1031@cse.psu.edu) and Bob Harris (rsharris@bx.psu.edu)
**Input**
- The input files can be fastq, fasta, fastq without quality score, and SAM format.
**Output**
For fastq, the output will contain the following columns:
- Column 1 = length of STR (bp)
- Column 2 = length of left flanking region (bp)
- Column 3 = length of right flanking region (bp)
- Column 4 = repeat motif (bp)
- Column 5 = hamming distance
- Column 6 = read name
- Column 7 = read sequence with soft masking of STR
- Column 8 = read quality (the same Phred score scale as input)
For fasta, fastq without quality score, and SAM format, column 8 will be replaced with a dot (.).
If users have a mapped file (SAM) and would like to profile STRs from premapped data instead of using the flank-based mapping approach, they can select SAM format input and specify that they want the corresponding STRs in the reference for comparison. The output will be as follows:
- Column 1 = length of STR (bp)
- Column 2 = length of left flanking region (bp)
- Column 3 = length of right flanking region (bp)
- Column 4 = repeat motif (bp)
- Column 5 = hamming distance
- Column 6 = read name
- Column 7 = read sequence with soft masking of STR
- Column 8 = read quality (the same Phred score scale as input)
- Column 9 = read name (The same as column 6)
- Column 10 = chromosome
- Column 11 = left flanking region start
- Column 12 = left flanking region stop
- Column 13 = STR start as inferred from paired-end data
- Column 14 = STR stop as inferred from paired-end data
- Column 15 = right flanking region start
- Column 16 = right flanking region stop
- Column 17 = STR length in reference
- Column 18 = STR sequence in reference
</help>
</tool>