-
Notifications
You must be signed in to change notification settings - Fork 1
/
fasta_reader.py
135 lines (113 loc) · 4.5 KB
/
fasta_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
from diskhash import StructHash
__author__ = 'glazek'
class IndexedFastaReader(object):
def __init__(self, fasta_file, index_file=None, use_mmap=False):
"""
Reader of fasta file indexed with diskhash index.
:param fasta_file: path to fasta file
:param index_file: path to index file (diskhash index). If None, uses (fasta_file + '.dhi')
"""
if index_file is None:
index_file = (fasta_file[:-len('.xz')] if fasta_file.endswith('.xz') else fasta_file) + '.dhi'
self.index = StructHash(index_file, 0, '2l', 'r')
if fasta_file.endswith('.xz'):
if use_mmap:
raise ValueError("use_mmap cannot be used with xz files")
try:
import xz
except ImportError:
raise ImportError("Module xz not found.\n"
"The standard Python lzma module does not support random access.\n\n"
"It can be installed with `pip install python-xz`\n")
self.fasta = xz.open(fasta_file, 'rb')
else:
self.fasta = open(fasta_file, 'rb')
self.use_mmap = use_mmap
if use_mmap:
import mmap
self.data = mmap.mmap(self.fasta.fileno(), 0, access=mmap.ACCESS_READ)
def __del__(self):
self.fasta.close()
def lookup(self, sequence_id):
"""
Lookup coordinates in the index file.
:param sequence_id: sequence identifier
:return: tuple: start position, length (in bases) and a boolean flag
indicating if the sequence includes newline characters.
"""
try:
s, lm = self.index.lookup(sequence_id)
return s, lm >> 1, bool(lm%2)
except TypeError:
return None
def get(self, sequence_id):
"""
Get the sequence corresponding to given sequence identifier.
Note: if you only need the length of the sequence, use get_length method instead.
:param sequence_id: sequence identifier