Skip to content

Commit

Permalink
perf: Use pysam directly in stead of pysamstats
Browse files Browse the repository at this point in the history
  • Loading branch information
KHajji committed Apr 14, 2022
1 parent 8fa7e3b commit afd8c0a
Showing 1 changed file with 45 additions and 22 deletions.
67 changes: 45 additions & 22 deletions TrueConsense/indexing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import gffpandas.gffpandas as gffpd
import pandas as pd
import pysam
import pysamstats


def Readbam(f):
Expand All @@ -22,26 +21,50 @@ def Override_index_positions(index, override_data):


def BuildIndex(bamfile, ref):
columns = ["coverage", "A", "T", "C", "G", "X", "I"]
p_index = pd.DataFrame(columns=columns)

for r in pysamstats.stat_pileup(
type="variation",
alignmentfile=Readbam(bamfile),
stepper="nofilter",
fafile=ref,
pad=True,
one_based=True,
max_depth=1000000000,
):
p_index.loc[r["pos"]] = (
[r["reads_all"]]
+ [r["A"]]
+ [r["T"]]
+ [r["C"]]
+ [r["G"]]
+ [r["deletions"]]
+ [r["insertions"]]
)
bamfile = pysam.AlignmentFile(bamfile, "rb")
ref_fasta = pysam.FastaFile(ref)
ref_length = ref_fasta.lengths[0]

pileup = bamfile.pileup(stepper="nofilter", max_depth=10000000, min_base_quality=0)

def parse_query_sequences(l):
coverage = a = c = t = g = x = i = 0
for b in l:
coverage += 1
if b == "*":
x += 1
elif b[0].lower() == "a":
a += 1
elif b[0].lower() == "t":
t += 1
elif b[0].lower() == "c":
c += 1
elif b[0].lower() == "g":
g += 1

# It is important to count the insertions seperately
if "+" in b:
i += 1
return coverage, a, t, c, g, x, i

columns = ["pos", "coverage", "A", "T", "C", "G", "X", "I"]

# 1 Is added to the position because our index starts at 1
p_index = pd.DataFrame(
(
(p.pos + 1,) + parse_query_sequences(p.get_query_sequences(add_indels=True))
for p in pileup
),
columns=columns,
)

# Since the pileup does not return positions without any reads mapped, we have to
# fill these with zeroes
missing_positions = set(range(1, ref_length + 1)) - set(p_index.pos)
missing_p_index = pd.DataFrame(
((i, 0, 0, 0, 0, 0, 0, 0) for i in missing_positions), columns=columns
)
p_index = p_index.append(missing_p_index).set_index("pos").sort_index()
p_index.index.name = None

return p_index

0 comments on commit afd8c0a

Please sign in to comment.