Skip to content

Commit

Permalink
Merge pull request #134 from widdowquinn/issue_132
Browse files Browse the repository at this point in the history
Fix issue #132 and prepare for 0.2.9 release
  • Loading branch information
widdowquinn authored May 21, 2019
2 parents 2b24ec9 + 8aa145b commit b2e978d
Show file tree
Hide file tree
Showing 6 changed files with 57 additions and 31 deletions.
7 changes: 7 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ tests/test_input/anib/blastall/*.dataframe
tests/test_*_out/
tests/test_*_output/
tests/test_concordance_*/
tests/test_output
tests/test_JSpecies/C1/
tests/test_JSpecies/C2/
tests/test_JSpecies/C3/
Expand Down Expand Up @@ -57,3 +58,9 @@ docs/
# Development virtualenvs
venv-*
.python-version

# issue-fixing directories
issue*/

# VS Code configs
.vscode
4 changes: 3 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# CHANGES.md

## v0.2.8-dev
## v0.2.9

- fix 1ssue #132: TETRA would fail if not all 4-mers present in all input sequences

## v0.2.8

Expand Down
2 changes: 1 addition & 1 deletion pyani/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# python package version
# should match r"^__version__ = '(?P<version>[^']+)'$" for setup.py
__version__ = "0.2.8"
__version__ = "0.2.9"
67 changes: 38 additions & 29 deletions pyani/tetra.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
import os
import math

from itertools import product

import pandas as pd

from Bio import SeqIO
Expand Down Expand Up @@ -57,19 +59,22 @@ def calculate_tetra_zscore(filename):
# For the Teeling et al. method, the Z-scores require us to count
# mono, di, tri and tetranucleotide sequences - these are stored
# (in order) in the counts tuple
counts = (collections.defaultdict(int), collections.defaultdict(int),
collections.defaultdict(int), collections.defaultdict(int))
for rec in SeqIO.parse(filename, 'fasta'):
for seq in [str(rec.seq).upper(),
str(rec.seq.reverse_complement()).upper()]:
counts = (
collections.defaultdict(int),
collections.defaultdict(int),
collections.defaultdict(int),
collections.defaultdict(int),
)
for rec in SeqIO.parse(filename, "fasta"):
for seq in [str(rec.seq).upper(), str(rec.seq.reverse_complement()).upper()]:
# The Teeling et al. algorithm requires us to consider
# both strand orientations, so monocounts are easy
for base in ('G', 'C', 'T', 'A'):
for base in ("G", "C", "T", "A"):
counts[0][base] += seq.count(base)
# For di, tri and tetranucleotide counts, loop over the
# sequence and its reverse complement, until near the end:
for i in range(len(seq[:-4])):
din, tri, tetra = seq[i:i+2], seq[i:i+3], seq[i:i+4]
din, tri, tetra = seq[i : i + 2], seq[i : i + 3], seq[i : i + 4]
counts[1][str(din)] += 1
counts[2][str(tri)] += 1
counts[3][str(tetra)] += 1
Expand All @@ -83,18 +88,21 @@ def calculate_tetra_zscore(filename):
# tetranucleotide; we ignore ambiguity symbols
tetra_exp = {}
for tet in [tetn for tetn in counts[3] if tetra_clean(tetn)]:
tetra_exp[tet] = 1. * counts[2][tet[:3]] * counts[2][tet[1:]] / \
counts[1][tet[1:3]]
tetra_exp[tet] = (
1.0 * counts[2][tet[:3]] * counts[2][tet[1:]] / counts[1][tet[1:3]]
)
# Following Teeling (2004) we approximate the std dev and Z-score for each
# tetranucleotide
tetra_sd = {}
tetra_z = {}
bases = ["A", "C", "G", "T"]
tetra_z = {"".join(_): 0 for _ in product(bases, bases, bases, bases)}
for tet, exp in list(tetra_exp.items()):
den = counts[1][tet[1:3]]
tetra_sd[tet] = math.sqrt(exp * (den - counts[2][tet[:3]]) *
(den - counts[2][tet[1:]]) / (den * den))
tetra_sd[tet] = math.sqrt(
exp * (den - counts[2][tet[:3]]) * (den - counts[2][tet[1:]]) / (den * den)
)
try:
tetra_z[tet] = (counts[3][tet] - exp)/tetra_sd[tet]
tetra_z[tet] = (counts[3][tet] - exp) / tetra_sd[tet]
except ZeroDivisionError:
# To record if we hit a zero in the estimation of variance
# zeroes = [k for k, v in list(tetra_sd.items()) if v == 0]
Expand All @@ -108,7 +116,7 @@ def tetra_clean(string):
symbols. We are assuming that a low frequency of IUPAC ambiguity
symbols doesn't affect our calculation.
"""
if not len(set(string) - set('ACGT')):
if not len(set(string) - set("ACGT")):
return True
return False

Expand All @@ -128,22 +136,23 @@ def calculate_correlations(tetra_z):
percentage identity.
"""
orgs = sorted(tetra_z.keys())
correlations = pd.DataFrame(index=orgs, columns=orgs,
dtype=float).fillna(1.0)
correlations = pd.DataFrame(index=orgs, columns=orgs, dtype=float).fillna(1.0)
for idx, org1 in enumerate(orgs[:-1]):
for org2 in orgs[idx+1:]:
assert sorted(tetra_z[org1].keys()) == sorted(tetra_z[org2].keys())
for org2 in orgs[idx + 1 :]:
tets = sorted(tetra_z[org1].keys())
zscores = [[tetra_z[org1][t] for t in tets],
[tetra_z[org2][t] for t in tets]]
zmeans = [sum(zscore)/len(zscore) for zscore in zscores]
zdiffs = [[z - zmeans[0] for z in zscores[0]],
[z - zmeans[1] for z in zscores[1]]]
diffprods = sum([zdiffs[0][i] * zdiffs[1][i] for i in
range(len(zdiffs[0]))])
zdiffs2 = [sum([z * z for z in zdiffs[0]]),
sum([z * z for z in zdiffs[1]])]
correlations[org1][org2] = diffprods / \
math.sqrt(zdiffs2[0] * zdiffs2[1])
zscores = [
[tetra_z[org1][t] for t in tets],
[tetra_z[org2][t] for t in tets],
]
zmeans = [sum(zscore) / len(zscore) for zscore in zscores]
zdiffs = [
[z - zmeans[0] for z in zscores[0]],
[z - zmeans[1] for z in zscores[1]],
]
diffprods = sum(
[zdiffs[0][i] * zdiffs[1][i] for i in range(len(zdiffs[0]))]
)
zdiffs2 = [sum([z * z for z in zdiffs[0]]), sum([z * z for z in zdiffs[1]])]
correlations[org1][org2] = diffprods / math.sqrt(zdiffs2[0] * zdiffs2[1])
correlations[org2][org1] = correlations[org1][org2]
return correlations
7 changes: 7 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
black
flake8
nose
pytest
setuptools
twine
wheel
1 change: 1 addition & 0 deletions requirements-pip.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pre-commit

0 comments on commit b2e978d

Please sign in to comment.