Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a function for reading a fasta into a dict of numpy arrays #333

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion allel/io/fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,47 @@ def write_fasta(path, sequences, names, mode='w', width=80):
header = b'>' + name + b'\n'
fasta.write(header)
for i in range(0, sequence.size, width):
line = sequence[i:i+width].tostring() + b'\n'
line = sequence[i:i + width].tostring() + b'\n'
fasta.write(line)


def read_fasta(path):
"""Read nucleotide sequences from a FASTA file and return them as a dictionary
mapping a sequence name to a sequence.

Parameters
----------

path : string
File path.

"""
sequences = []
names = []

current_sequence = []

with open(path, 'r') as fasta:

# Skip any headers and get the first line
for line in fasta:
if line.startswith('>'):
name = line[1:].rstrip()
break

for line in fasta:
if line.startswith('>'):
names.append(name)
sequences.append(np.concatenate(current_sequence))
name = line[1:].rstrip()
current_sequence = []
elif line.startswith(';'):
continue
else:
current_sequence.append(np.array(list(line.strip()), dtype=np.dtype('S1')))
else:
if names:
names.append(name)
sequences.append(np.concatenate(current_sequence))

return dict(zip(names, sequences))
45 changes: 45 additions & 0 deletions allel/test/data/test.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
>crab_anapl ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDITIHNPLIRRPLFSWLAPSRIFDQIFGEHLQESELLPASPSLSPFLMR
SPIFRMPSWLETGLSEMRLEKDKFSVNLDVKHFSPEELKVKVLGDMVEIH
GKHEERQDEHGFIAREFNRKYRIPADVDPLTITSSLSLDGVLTVSAPRKQ
SDVPERSIPITREEKPAIAGAQRK
>crab_bovin ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFPASTSLSPFYLR
PPSFLRAPSWIDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV
HGKHEERQDEHGFISREFHRKYRIPADVDPLAITSSLSSDGVLTVNGPRK
QASGPERTIPITREEKPAVTAAPKK
>crab_chick ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDITIHNPLVRRPLFSWLTPSRIFDQIFGEHLQESELLPTSPSLSPFLMR
SPFFRMPSWLETGLSEMRLEKDKFSVNLDVKHFSPEELKVKVLGDMIEIH
GKHEERQDEHGFIAREFSRKYRIPADVDPLTITSSLSLDGVLTVSAPRKQ
SDVPERSIPITREEKPAIAGSQRK
>crab_human ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFPTSTSLSPFYLR
PPSFLRAPSWFDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV
HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK
QVSGPERTIPITREEKPAVTAAPKK
>crab_mesau ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFSTATSLSPFYLR
PPSFLRAPSWIDTGLSEMRMEKDRFSVNLDVKHFSPEELKVKVLGDVVEV
HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK
QASGPERTIPITREEKPAVTAAPKK
>crab_mouse ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN) (P23).
MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFSTATSLSPFYLR
PPSFLRAPSWIDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV
HGKHEERQDEHGFISREFHRKYRIPADVDPLAITSSLSSDGVLTVNGPRK
QVSGPERTIPITREEKPAVAAAPKK
>crab_rabit ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFPTSTSLSPFYLR
PPSFLRAPSWIDTGLSEMRLEKDRFSVNLDVKHFSPEELKVKVLGDVIEV
HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK
QAPGPERTIPITREEKPAVTAAPKK
>crab_rat ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIHHPWIRRPFFPFHSPSRLFDQFFGEHLLESDLFSTATSLSPFYLR
PPSFLRAPSWIDTGLSEMRMEKDRFSVNLDVKHFSPEELKVKVLGDVIEV
HGKHEERQDEHGFISREFHRKYRIPADVDPLTITSSLSSDGVLTVNGPRK
QASGPERTIPITREEKPAVTAAPKK
>crab_squac ALPHA CRYSTALLIN B CHAIN (ALPHA(B)-CRYSTALLIN).
MDIAIQHPWLRRPLFPSSIFPSRIFDQNFGEHFDPDLFPSFSSMLSPFYW
RMGAPMARMPSWAQTGLSELRLDKDKFAIHLDVKHFTPEELRVKILGDFI
EVQAQHEERQDEHGYVSREFHRKYKVPAGVDPLVITCSLSADGVLTITGP
RKVADVPERSVPISRDEKPAVAGPQQK
10 changes: 10 additions & 0 deletions allel/test/data/test_comments.fa
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
;Comment line
>sequence A
ggtaagtcctctagtacaaacacccccaatattgtgatataattaaaattatattcatat
tctgttgccagaaaaaacacttttaggctatattagagccatcttctttgaagcgttgtc
;Multiple comment lines
;Another comment line
>sequence B
ggtaagtgctctagtacaaacacccccaatattgtgatataattaaaattatattcatat
tctgttgccagattttacacttttaggctatattagagccatcttctttgaagcgttgtc
tatgcatcgatcgacgactg
21 changes: 21 additions & 0 deletions allel/test/io/test_fasta_read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

from allel.io.fasta import read_fasta


def fixture_path(fn):
return os.path.join(os.path.dirname(__file__), os.pardir, 'data', fn)


def test_fasta_read():
path = fixture_path('test.fa')
data = read_fasta(path)
assert len(data.keys()) == 9


def test_fasta_read_comments():
path = fixture_path('test_comments.fa')
data = read_fasta(path)
assert len(data.keys()) == 2
assert 'sequence A' in data.keys()
assert 'sequence B' in data.keys()