Skip to content

Commit

Permalink
Merge pull request #82 from lee212/master
Browse files Browse the repository at this point in the history
- gmx parser (_extract_dataframe) using pd.read_csv for performance
- close #81
  • Loading branch information
orbeckst authored Jun 22, 2019
2 parents 486998f + dbd5a3b commit 38b33bc
Show file tree
Hide file tree
Showing 2 changed files with 201 additions and 40 deletions.
237 changes: 199 additions & 38 deletions src/alchemlyb/parsing/gmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
"""
import pandas as pd
import numpy as np

from .util import anyopen

Expand Down Expand Up @@ -128,10 +129,11 @@ def extract_dHdl(xvg, T):
"""
beta = 1/(k_b * T)

state, lambdas, statevec = _extract_state(xvg)
headers = _get_headers(xvg)
state, lambdas, statevec = _extract_state(xvg, headers)

# extract a DataFrame from XVG data
df = _extract_dataframe(xvg)
df = _extract_dataframe(xvg, headers)

times = df[df.columns[0]]

Expand Down Expand Up @@ -186,30 +188,38 @@ def extract_dHdl(xvg, T):
return dHdl


def _extract_state(xvg):
def _extract_state(xvg, headers=None):
"""Extract information on state sampled, names of lambdas.
Parameters
----------
xvg : str
Path to XVG file to extract data from.
headers: dict
headers dictionary to search header information, reduced I/O by
reusing if it is already parsed, e.g. _extract_state and
_extract_dataframe in order need one-time header parsing
"""
state = None
with anyopen(xvg, 'r') as f:
for line in f:
if ('subtitle' in line) and ('state' in line):
state = int(line.split('state')[1].split(':')[0])
lambdas = [word.strip(')(,') for word in line.split() if 'lambda' in word]
statevec = eval(line.strip().split(' = ')[-1].strip('"'))
break
if headers is None:
headers = _get_headers(xvg)
subtitle = _get_value_by_key(headers, 'subtitle')
if subtitle and 'state' in subtitle:
state = int(subtitle.split('state')[1].split(':')[0])
lambdas = [word.strip(')(,') for word in subtitle.split() if 'lambda' in word]
statevec = eval(subtitle.strip().split(' = ')[-1].strip('"'))

# if expanded ensemble data is used the state variable will never be assigned
# parsing expanded ensemble data
if state is None:
lambdas = []
statevec = []
with anyopen(xvg, 'r') as f:
for line in f:
if ('legend' in line) and ('lambda' in line):
lambdas.append([word.strip(')(,') for word in line.split() if 'lambda' in word][0])
if ('legend' in line) and (' to ' in line):
statevec.append(([float(i) for i in line.strip().split(' to ')[-1].strip('"()').split(',')]))
for line in headers['_raw_lines']:
if ('legend' in line) and ('lambda' in line):
lambdas.append([word.strip(')(,') for word in line.split() if 'lambda' in word][0])
if ('legend' in line) and (' to ' in line):
statevec.append(([float(i) for i in line.strip().split(' to ')[-1].strip('"()').split(',')]))

return state, lambdas, statevec

Expand All @@ -227,38 +237,189 @@ def _extract_legend(xvg):
return state_legend


def _extract_dataframe(xvg):
"""Extract a DataFrame from XVG data.
def _extract_dataframe(xvg, headers=None):
"""Extract a DataFrame from XVG data using Pandas `read_csv()`.
pd.read_csv() shows the same behavior building pandas Dataframe with better
performance (approx. 2 to 4 times speed up). See Issue #81.
Parameters
----------
xvg: str
Path to XVG file to extract data from.
headers: dict
headers dictionary to search header information, reduced I/O by
reusing if it is already parsed. Direct access by key name
"""
if headers is None:
headers = _get_headers(xvg)
xaxis = _get_value_by_key(headers, 'xaxis', 'label')
names = [_get_value_by_key(headers, 's{}'.format(x), 'legend') for x in
range(len(headers)) if 's{}'.format(x) in headers]
cols = [xaxis] + names
header_cnt = len(headers['_raw_lines'])
df = pd.read_csv(xvg, sep=r"\s+", header=None, skiprows=header_cnt,
na_filter=True, memory_map=True, names=cols, dtype=np.float64,
float_precision='high')

# Dealing with duplicates
# Pandas permits unique column names only, mangle_dupe_cols=False re-write
# deplicates (not implemented as of 0.24.2)
nlen = len(names)
nlen_uniq = len(set(names))
if nlen > nlen_uniq:
df = pd.DataFrame(df, columns=cols)

return df


def _parse_header(line, headers={}, depth=2):
"""Build python dictionary for single line header
Update python dictionary to ``headers`` by reading ``line`` separated by
whitespace. If ``depth`` is given, at most ``depth`` nested key value store
is added. `_val` key is reserved which contain remaining words from
``line``.
Note
----
No return value but 'headers' dictionary will be updated.
Parameters
----------
line: str
header line to parse
headers: dict
headers dictionary to update, pass by reference
depth: int
depth of nested key and value store
Examples
--------
"x y z" line turns into { 'x': { 'y': {'_val': 'z' }}}
>>> headers={}
>>> _parse_header('@ s0 legend "Potential Energy (kJ/mol)"', headers)
>>> headers
{'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}}}
"""
# Remove a first character, i.e. @
s = line[1:].split(None, 1)
next_t = headers[s[0]] = {}
for i in range(1, depth):
# ord('"') == 34
# no further parsing for quoted value
if len(s) > 1 and s[1][0] != '"':
s = s[1].split(None, 1)
next_t[s[0]] = {}
next_t = next_t[s[0]]
else:
break

next_t["_val"] = ''.join(s[1:]).rstrip().strip('"')


def _get_headers(xvg):
"""Build python dictionary from header lines
Build nested key and value store by reading header ('@') lines from a file.
Direct access to value provides reduced time complexity O(1).
`_raw_lines` key is reserved to keep the original text.
Example
-------
Given a xvg file containinig header lines like:
...
@ title "dH/d\\xl\\f{} and \\xD\\f{}H"
@ xaxis label "Time (ps)"
@ yaxis label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"
@TYPE xy
@ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"
@ view 0.15, 0.15, 0.75, 0.85
@ legend on
@ legend box on
@ legend loctype view
@ legend 0.78, 0.8
@ legend length 2
@ s0 legend "Potential Energy (kJ/mol)"
@ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"
@ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"
...
>>> _get_headers(xvg)
{'TYPE': {'xy': {'_val': ''}},
'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'},
'title': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H'},
'view': {'0.15,': {'_val': '0.15, 0.75, 0.85'}},
'xaxis': {'label': {'_val': 'Time (ps)'}},
'yaxis': {'label': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)'}},
...(omitted)...
'_raw_lines': ['@ title "dH/d\\xl\\f{} and \\xD\\f{}H"',
'@ xaxis label "Time (ps)"',
'@ yaxis label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"',
'@TYPE xy',
'@ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"',
'@ view 0.15, 0.15, 0.75, 0.85',
'@ legend on',
'@ legend box on',
'@ legend loctype view',
'@ legend 0.78, 0.8',
'@ legend length 2',
'@ s0 legend "Potential Energy (kJ/mol)"',
'@ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"',
'@ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"'],
}
Returns
-------
headers: dict
"""
with anyopen(xvg, 'r') as f:
names = []
rows = []
headers = { '_raw_lines': [] }
for line in f:
line = line.strip()
if len(line) == 0:
continue
if line.startswith('@'):
_parse_header(line, headers)
headers['_raw_lines'].append(line)
elif line.startswith('#'):
headers['_raw_lines'].append(line)
continue
# assuming to start a body section
else:
break

if "label" in line and "xaxis" in line:
xaxis = line.split('"')[-2]
return headers

if line.startswith("@ s") and "subtitle" not in line:
name = line.split("legend ")[-1].replace('"','').strip()
names.append(name)

# should catch non-numeric lines so we don't proceed in parsing
# here
if line.startswith(('#', '@')):
continue
def _get_value_by_key(headers, key1, key2=None):
"""Return value by two-level keys where the second key is optional
if line.startswith('&'): #pragma: no cover
raise NotImplementedError('{}: Multi-data not supported,'
'only simple NXY format.'.format(xvg))
# parse line as floats
row = map(float, line.split())
rows.append(row)
Example
-------
>>> headers
{'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}},
'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda,
vdw-lambda) = (0.9500, 0.0000)'}}
>>> _get_value_by_key(header, 's0','legend')
'Potential Energy (kJ/mol)'
>>> _get_value_by_key(header, 'subtitle')
'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'
cols = [xaxis]
cols.extend(names)
"""
val = None
if key1 in headers:
if key2 is not None and key2 in headers[key1]:
val = headers[key1][key2]['_val']
else:
val = headers[key1]['_val']

return pd.DataFrame(rows, columns=cols)
return val
4 changes: 2 additions & 2 deletions src/alchemlyb/parsing/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
bz2_open = bz2.BZ2File
else:
def bz2_open(filename, mode):
mode += 't'
mode += 't' if mode in ['r','w','a','x'] else ''
return bz2.open(filename, mode)

# similar changes need to be made for gzip
Expand All @@ -28,7 +28,7 @@ def bz2_open(filename, mode):
gzip_open = gzip.open
else:
def gzip_open(filename, mode):
mode += 't'
mode += 't' if mode in ['r','w','a','x'] else ''
return gzip.open(filename, mode)


Expand Down

0 comments on commit 38b33bc

Please sign in to comment.