Merge pull request #82 from lee212/master

- gmx parser (_extract_dataframe) using pd.read_csv for performance - close #81
alchemistry · Jun 22, 2019 · 38b33bc · 38b33bc
2 parents 486998f + dbd5a3b
commit 38b33bc
Show file tree

Hide file tree

Showing 2 changed files with 201 additions and 40 deletions.
diff --git a/src/alchemlyb/parsing/gmx.py b/src/alchemlyb/parsing/gmx.py
@@ -2,6 +2,7 @@
 
 """
 import pandas as pd
+import numpy as np
 
 from .util import anyopen
 
@@ -128,10 +129,11 @@ def extract_dHdl(xvg, T):
     """
     beta = 1/(k_b * T)
 
-    state, lambdas, statevec = _extract_state(xvg)
+    headers = _get_headers(xvg)
+    state, lambdas, statevec = _extract_state(xvg, headers)
 
     # extract a DataFrame from XVG data
-    df = _extract_dataframe(xvg)
+    df = _extract_dataframe(xvg, headers)
 
     times = df[df.columns[0]]
 
@@ -186,30 +188,38 @@ def extract_dHdl(xvg, T):
     return dHdl
 
 
-def _extract_state(xvg):
+def _extract_state(xvg, headers=None):
     """Extract information on state sampled, names of lambdas.
 
+    Parameters
+    ----------
+    xvg : str
+        Path to XVG file to extract data from.
+    headers: dict
+       headers dictionary to search header information, reduced I/O by
+       reusing if it is already parsed, e.g. _extract_state and
+       _extract_dataframe in order need one-time header parsing
+
     """
     state = None
-    with anyopen(xvg, 'r') as f:
-        for line in f:
-            if ('subtitle' in line) and ('state' in line):
-                state = int(line.split('state')[1].split(':')[0])
-                lambdas = [word.strip(')(,') for word in line.split() if 'lambda' in word]
-                statevec = eval(line.strip().split(' = ')[-1].strip('"'))
-                break
+    if headers is None:
+        headers = _get_headers(xvg)
+    subtitle = _get_value_by_key(headers, 'subtitle')
+    if subtitle and 'state' in subtitle:
+        state = int(subtitle.split('state')[1].split(':')[0])
+        lambdas = [word.strip(')(,') for word in subtitle.split() if 'lambda' in word]
+        statevec = eval(subtitle.strip().split(' = ')[-1].strip('"'))
 
     # if expanded ensemble data is used the state variable will never be assigned
     # parsing expanded ensemble data
     if state is None:
         lambdas = []
         statevec = []
-        with anyopen(xvg, 'r') as f:
-            for line in f:
-                if ('legend' in line) and ('lambda' in line):
-                    lambdas.append([word.strip(')(,') for word in line.split() if 'lambda' in word][0])
-                if ('legend' in line) and (' to ' in line):
-                    statevec.append(([float(i) for i in line.strip().split(' to ')[-1].strip('"()').split(',')]))
+        for line in headers['_raw_lines']:
+            if ('legend' in line) and ('lambda' in line):
+                lambdas.append([word.strip(')(,') for word in line.split() if 'lambda' in word][0])
+            if ('legend' in line) and (' to ' in line):
+                statevec.append(([float(i) for i in line.strip().split(' to ')[-1].strip('"()').split(',')]))
 
     return state, lambdas, statevec
 
@@ -227,38 +237,189 @@ def _extract_legend(xvg):
     return state_legend
 
 
-def _extract_dataframe(xvg):
-    """Extract a DataFrame from XVG data.
+def _extract_dataframe(xvg, headers=None):
+    """Extract a DataFrame from XVG data using Pandas `read_csv()`.
+
+    pd.read_csv() shows the same behavior building pandas Dataframe with better
+    performance (approx. 2 to 4 times speed up). See Issue #81.
+
+    Parameters
+    ----------
+    xvg: str
+       Path to XVG file to extract data from.
+    headers: dict
+       headers dictionary to search header information, reduced I/O by 
+       reusing if it is already parsed. Direct access by key name
+
+    """
+    if headers is None:
+        headers = _get_headers(xvg)
+    xaxis = _get_value_by_key(headers, 'xaxis', 'label')
+    names = [_get_value_by_key(headers, 's{}'.format(x), 'legend') for x in
+            range(len(headers)) if 's{}'.format(x) in headers]
+    cols = [xaxis] + names
+    header_cnt = len(headers['_raw_lines'])
+    df = pd.read_csv(xvg, sep=r"\s+", header=None, skiprows=header_cnt,
+            na_filter=True, memory_map=True, names=cols, dtype=np.float64,
+            float_precision='high')
+
+    # Dealing with duplicates
+    # Pandas permits unique column names only, mangle_dupe_cols=False re-write
+    # deplicates (not implemented as of 0.24.2)
+    nlen = len(names)
+    nlen_uniq = len(set(names))
+    if nlen > nlen_uniq:
+        df = pd.DataFrame(df, columns=cols)
+
+    return df
+
+
+def _parse_header(line, headers={}, depth=2):
+    """Build python dictionary for single line header
+
+    Update python dictionary to ``headers`` by reading ``line`` separated by
+    whitespace. If ``depth`` is given, at most ``depth`` nested key value store
+    is added. `_val` key is reserved which contain remaining words from
+    ``line``.
+
+    Note
+    ----
+    No return value but 'headers' dictionary will be updated.
+
+    Parameters
+    ----------
+
+    line: str
+        header line to parse
+    headers: dict
+        headers dictionary to update, pass by reference
+    depth: int
+        depth of nested key and value store 
+        
+    Examples
+    --------
+    "x y z" line turns into { 'x': { 'y': {'_val': 'z' }}}
+
+    >>> headers={}
+    >>> _parse_header('@ s0 legend "Potential Energy (kJ/mol)"', headers)
+    >>> headers
+    {'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}}}
+    
+    """
+    # Remove a first character, i.e. @
+    s = line[1:].split(None, 1)
+    next_t = headers[s[0]] = {}
+    for i in range(1, depth):
+        # ord('"') == 34
+        # no further parsing for quoted value
+        if len(s) > 1 and s[1][0] != '"':
+            s = s[1].split(None, 1)
+            next_t[s[0]] = {}
+            next_t = next_t[s[0]]
+        else:
+            break
+
+    next_t["_val"] = ''.join(s[1:]).rstrip().strip('"')
+
+
+def _get_headers(xvg):
+    """Build python dictionary from header lines
+    
+    Build nested key and value store by reading header ('@') lines from a file.
+    Direct access to value provides reduced time complexity O(1).
+    `_raw_lines` key is reserved to keep the original text.
+
+    Example
+    -------
+
+    Given a xvg file containinig header lines like:
+
+        ...
+       @    title "dH/d\\xl\\f{} and \\xD\\f{}H"
+       @    xaxis  label "Time (ps)"
+       @    yaxis  label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"
+       @TYPE xy
+       @ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"
+       @ view 0.15, 0.15, 0.75, 0.85
+       @ legend on
+       @ legend box on
+       @ legend loctype view
+       @ legend 0.78, 0.8
+       @ legend length 2
+       @ s0 legend "Potential Energy (kJ/mol)"
+       @ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"
+       @ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"
+       ...
+
+    >>> _get_headers(xvg)
+    {'TYPE': {'xy': {'_val': ''}},
+      'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'},
+      'title': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H'},
+      'view': {'0.15,': {'_val': '0.15, 0.75, 0.85'}},
+      'xaxis': {'label': {'_val': 'Time (ps)'}},
+      'yaxis': {'label': {'_val': 'dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)'}},
+      ...(omitted)...
+      '_raw_lines': ['@    title "dH/d\\xl\\f{} and \\xD\\f{}H"',
+                    '@    xaxis  label "Time (ps)"',
+                    '@    yaxis  label "dH/d\\xl\\f{} and \\xD\\f{}H (kJ/mol [\\xl\\f{}]\\S-1\\N)"',
+                    '@TYPE xy',
+                    '@ subtitle "T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)"',
+                    '@ view 0.15, 0.15, 0.75, 0.85',
+                    '@ legend on',
+                    '@ legend box on',
+                    '@ legend loctype view',
+                    '@ legend 0.78, 0.8',
+                    '@ legend length 2',
+                    '@ s0 legend "Potential Energy (kJ/mol)"',
+                    '@ s1 legend "dH/d\\xl\\f{} coul-lambda = 0.9500"',
+                    '@ s2 legend "dH/d\\xl\\f{} vdw-lambda = 0.0000"'],
+      }
+
+    Returns
+    -------
+    headers: dict
 
     """
     with anyopen(xvg, 'r') as f:
-        names = []
-        rows = []
+        headers = { '_raw_lines': [] }
         for line in f:
             line = line.strip()
             if len(line) == 0:
                 continue
+            if line.startswith('@'):
+                _parse_header(line, headers)
+                headers['_raw_lines'].append(line)
+            elif line.startswith('#'):
+                headers['_raw_lines'].append(line)
+                continue
+            # assuming to start a body section
+            else:
+                break
 
-            if "label" in line and "xaxis" in line:
-                xaxis = line.split('"')[-2]
+    return headers
 
-            if line.startswith("@ s") and "subtitle" not in line:
-                name = line.split("legend ")[-1].replace('"','').strip()
-                names.append(name)
 
-            # should catch non-numeric lines so we don't proceed in parsing
-            # here
-            if line.startswith(('#', '@')):
-                continue
+def _get_value_by_key(headers, key1, key2=None):
+    """Return value by two-level keys where the second key is optional
 
-            if line.startswith('&'):  #pragma: no cover
-                raise NotImplementedError('{}: Multi-data not supported,'
-                                          'only simple NXY format.'.format(xvg))
-            # parse line as floats
-            row = map(float, line.split())
-            rows.append(row)
+    Example
+    -------
+
+    >>> headers
+    {'s0': {'legend': {'_val': 'Potential Energy (kJ/mol)'}}, 
+            'subtitle': {'_val': 'T = 310 (K) \\xl\\f{} state 38: (coul-lambda,
+                vdw-lambda) = (0.9500, 0.0000)'}}
+    >>> _get_value_by_key(header, 's0','legend')
+    'Potential Energy (kJ/mol)'
+    >>> _get_value_by_key(header, 'subtitle')
+    'T = 310 (K) \\xl\\f{} state 38: (coul-lambda, vdw-lambda) = (0.9500, 0.0000)'
 
-    cols = [xaxis]
-    cols.extend(names)
+    """
+    val = None
+    if key1 in headers:
+        if key2 is not None and key2 in headers[key1]:
+            val = headers[key1][key2]['_val']
+        else:
+            val = headers[key1]['_val']
 
-    return pd.DataFrame(rows, columns=cols)
+    return val
diff --git a/src/alchemlyb/parsing/util.py b/src/alchemlyb/parsing/util.py
@@ -15,7 +15,7 @@
     bz2_open = bz2.BZ2File
 else:
     def bz2_open(filename, mode):
-        mode += 't'
+        mode += 't' if mode in ['r','w','a','x'] else ''
         return bz2.open(filename, mode)
 
 # similar changes need to be made for gzip
@@ -28,7 +28,7 @@ def bz2_open(filename, mode):
     gzip_open = gzip.open
 else:
     def gzip_open(filename, mode):
-        mode += 't'
+        mode += 't' if mode in ['r','w','a','x'] else ''
         return gzip.open(filename, mode)