Skip to content

Commit

Permalink
ENH: can pass list of columns to parse_dates, close #853 and add dayfirst argument for european dates, close #854
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Apr 15, 2012
1 parent 141df57 commit fc56b64
Show file tree
Hide file tree
Showing 8 changed files with 111 additions and 31 deletions.
10 changes: 6 additions & 4 deletions pandas/core/datetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def parser(x):
data = p_ufunc(arr)
return np.array(data, dtype='M8[us]')

def to_datetime(arg, errors='ignore'):
def to_datetime(arg, errors='ignore', dayfirst=False):
"""
Convert argument to datetime
Expand All @@ -87,14 +87,16 @@ def to_datetime(arg, errors='ignore'):
return arg
elif isinstance(arg, Series):
values = lib.string_to_datetime(com._ensure_object(arg.values),
raise_=errors == 'raise')
raise_=errors == 'raise',
dayfirst=dayfirst)
return Series(values, index=arg.index, name=arg.name)
elif isinstance(arg, np.ndarray):
return lib.string_to_datetime(com._ensure_object(arg),
raise_=errors == 'raise')
raise_=errors == 'raise',
dayfirst=dayfirst)

try:
return parser.parse(arg)
return parser.parse(arg, dayfirst=dayfirst)
except Exception:
if errors == 'raise':
raise
Expand Down
10 changes: 8 additions & 2 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,12 +118,18 @@ def astype(self, dtype):
return Index(self.values.astype(dtype), name=self.name,
dtype=dtype)

def to_datetime(self, dayfirst=False):
    """
    For an Index containing strings or datetime.datetime objects, attempt
    conversion to DatetimeIndex

    Parameters
    ----------
    dayfirst : boolean, default False
        Interpret ambiguous dates as DD/MM (European convention) when
        parsing strings

    Returns
    -------
    DatetimeIndex
    """
    if self.inferred_type == 'string':
        from dateutil.parser import parse
        # bind dayfirst so try_parse_dates can call a one-argument parser
        parser = lambda x: parse(x, dayfirst=dayfirst)
        parsed = lib.try_parse_dates(self.values, parser=parser)
        return DatetimeIndex(parsed)
    else:
        # values are already datetime-like; hand straight to DatetimeIndex
        return DatetimeIndex(self.values)

@property
def dtype(self):
Expand Down
73 changes: 54 additions & 19 deletions pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,13 @@
na_values : list-like or dict, default None
Additional strings to recognize as NA/NaN. If dict passed, specific
per-column NA values
parse_dates : boolean, default False
Attempt to parse dates in the index column(s)
parse_dates : boolean or list of column numbers/name, default False
Attempt to parse dates in the indicated columns
date_parser : function
Function to use for converting a sequence of date strings to datetime
instances. Defaults to dateutil.parser
dayfirst : boolean, default False
DD/MM format dates, international and European format
nrows : int, default None
Number of rows of file to read. Useful for reading pieces of large files
iterator : boolean, default False
Expand Down Expand Up @@ -168,6 +170,7 @@ def read_csv(filepath_or_buffer,
skiprows=None,
na_values=None,
parse_dates=False,
dayfirst=False,
date_parser=None,
nrows=None,
iterator=False,
Expand Down Expand Up @@ -195,6 +198,7 @@ def read_table(filepath_or_buffer,
skiprows=None,
na_values=None,
parse_dates=False,
dayfirst=False,
date_parser=None,
nrows=None,
iterator=False,
Expand Down Expand Up @@ -226,6 +230,7 @@ def read_fwf(filepath_or_buffer,
skiprows=None,
na_values=None,
parse_dates=False,
dayfirst=False,
date_parser=None,
nrows=None,
iterator=False,
Expand All @@ -242,7 +247,8 @@ def read_fwf(filepath_or_buffer,
colspecs = kwds.get('colspecs', None)
widths = kwds.pop('widths', None)
if bool(colspecs is None) == bool(widths is None):
raise ValueError("You must specify only one of 'widths' and 'colspecs'")
raise ValueError("You must specify only one of 'widths' and "
"'colspecs'")

# Compute 'colspec' from 'widths', if specified.
if widths is not None:
Expand All @@ -258,8 +264,8 @@ def read_fwf(filepath_or_buffer,

def read_clipboard(**kwargs): # pragma: no cover
"""
Read text from clipboard and pass to read_table. See read_table for the full
argument list
Read text from clipboard and pass to read_table. See read_table for the
full argument list
Returns
-------
Expand Down Expand Up @@ -334,9 +340,9 @@ class TextParser(object):

def __init__(self, f, delimiter=None, names=None, header=0,
index_col=None, na_values=None, parse_dates=False,
date_parser=None, chunksize=None, skiprows=None,
skip_footer=0, converters=None, verbose=False,
encoding=None):
date_parser=None, dayfirst=False, chunksize=None,
skiprows=None, skip_footer=0, converters=None,
verbose=False, encoding=None):
"""
Workhorse function for processing nested list into DataFrame
Expand All @@ -348,12 +354,14 @@ def __init__(self, f, delimiter=None, names=None, header=0,
self.names = list(names) if names is not None else names
self.header = header
self.index_col = index_col
self.parse_dates = parse_dates
self.date_parser = date_parser
self.chunksize = chunksize
self.passed_names = names is not None
self.encoding = encoding

self.parse_dates = parse_dates
self.date_parser = date_parser
self.dayfirst = dayfirst

if com.is_integer(skiprows):
skiprows = range(skiprows)
self.skiprows = set() if skiprows is None else set(skiprows)
Expand Down Expand Up @@ -382,6 +390,10 @@ def __init__(self, f, delimiter=None, names=None, header=0,
else:
self.data = f
self.columns = self._infer_columns()

# get popped off for index
self.orig_columns = list(self.columns)

self.index_name = self._get_index_name()
self._first_chunk = True

Expand Down Expand Up @@ -588,17 +600,19 @@ def get_chunk(self, rows=None):
zipped_content.pop(i)

if np.isscalar(self.index_col):
if self.parse_dates:
index = lib.try_parse_dates(index, parser=self.date_parser)
if self._should_parse_dates(0):
index = lib.try_parse_dates(index, parser=self.date_parser,
dayfirst=self.dayfirst)
index, na_count = _convert_types(index, self.na_values)
index = Index(index, name=self.index_name)
if self.verbose and na_count:
print 'Found %d NA values in the index' % na_count
else:
arrays = []
for arr in index:
if self.parse_dates:
arr = lib.try_parse_dates(arr, parser=self.date_parser)
for i, arr in enumerate(index):
if self._should_parse_dates(i):
arr = lib.try_parse_dates(arr, parser=self.date_parser,
dayfirst=self.dayfirst)
arr, _ = _convert_types(arr, self.na_values)
arrays.append(arr)
index = MultiIndex.from_arrays(arrays, names=self.index_name)
Expand All @@ -623,10 +637,30 @@ def get_chunk(self, rows=None):
col = self.columns[col]
data[col] = lib.map_infer(data[col], f)

if not isinstance(self.parse_dates, bool):
for x in self.parse_dates:
if isinstance(x, int) and x not in data:
x = self.orig_columns[x]
if x in self.index_col or x in self.index_name:
continue
data[x] = lib.try_parse_dates(data[x], parser=self.date_parser,
dayfirst=self.dayfirst)

data = _convert_to_ndarrays(data, self.na_values, self.verbose)

return DataFrame(data=data, columns=self.columns, index=index)

def _should_parse_dates(self, i):
if isinstance(self.parse_dates, bool):
return self.parse_dates
else:
to_parse = self.parse_dates
if np.isscalar(self.index_col):
name = self.index_name
else:
name = self.index_name[i]
return i in to_parse or name in to_parse

def _get_lines(self, rows=None):
source = self.data
lines = self.buf
Expand Down Expand Up @@ -725,7 +759,8 @@ def __init__(self, f, colspecs, filler):
def next(self):
    """Return the next row, sliced into fields by ``self.colspecs``.

    Each field is stripped of the filler character (space when no filler
    was given). The scrape of this diff had left two merged ``return``
    statements; only the wrapped one is kept.
    """
    line = self.f.next()
    # Note: 'colspecs' is a sequence of half-open intervals.
    return [line[fromm:to].strip(self.filler or ' ')
            for (fromm, to) in self.colspecs]


class FixedWidthFieldParser(TextParser):
Expand All @@ -743,7 +778,7 @@ def _make_reader(self, f):
self.data = FixedWidthReader(f, self.colspecs, self.delimiter)


#-------------------------------------------------------------------------------
#----------------------------------------------------------------------
# ExcelFile class

_openpyxl_msg = ("\nFor parsing .xlsx files 'openpyxl' is required.\n"
Expand Down Expand Up @@ -795,8 +830,8 @@ def parse(self, sheetname, header=0, skiprows=None, index_col=None,
skiprows : list-like
Row numbers to skip (0-indexed)
index_col : int, default None
Column to use as the row labels of the DataFrame. Pass None if there
is no such column
Column to use as the row labels of the DataFrame. Pass None if
there is no such column
na_values : list-like, default None
List of additional strings to recognize as NA/NaN
Expand Down
25 changes: 25 additions & 0 deletions pandas/io/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,31 @@ def test_parse_dates_implicit_first_col(self):
self.assert_(isinstance(df.index[0], (datetime, np.datetime64, Timestamp)))
assert_frame_equal(df, expected)

def test_parse_dates_column_list(self):
    # parse_dates may be a list of column positions or names; dayfirst
    # forces DD/MM interpretation of ambiguous dates (e.g. 15/01/2010)
    from pandas.core.datetools import to_datetime

    data = '''date;destination;ventilationcode;unitcode;units;aux_date
01/01/2010;P;P;50;1;12/1/2011
01/01/2010;P;R;50;1;13/1/2011
15/01/2010;P;P;50;1;14/1/2011
01/05/2010;P;P;50;1;15/1/2011'''

    # expected frame: read without parsing, then convert dates by hand
    expected = read_csv(StringIO(data), sep=";", index_col=range(4))

    # first MultiIndex level holds the 'date' strings
    lev = expected.index.levels[0]
    expected.index.levels[0] = lev.to_datetime(dayfirst=True)
    expected['aux_date'] = to_datetime(expected['aux_date'],
                                       dayfirst=True).astype('O')
    self.assert_(isinstance(expected['aux_date'][0], datetime))

    # parse_dates given as integer column positions (0-based)
    df = read_csv(StringIO(data), sep=";", index_col = range(4),
                  parse_dates=[0, 5], dayfirst=True)
    assert_frame_equal(df, expected)

    # parse_dates given as column names must behave identically
    df = read_csv(StringIO(data), sep=";", index_col = range(4),
                  parse_dates=['date', 'aux_date'], dayfirst=True)
    assert_frame_equal(df, expected)

def test_no_header(self):
data = """1,2,3,4,5
6,7,8,9,10
Expand Down
6 changes: 3 additions & 3 deletions pandas/src/datetime.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -613,7 +613,7 @@ cdef class DayOffset(_Offset):
# offset.next()
# return i

def string_to_datetime(ndarray[object] strings, raise_=False):
def string_to_datetime(ndarray[object] strings, raise_=False, dayfirst=False):
cdef:
Py_ssize_t i, n = len(strings)
object val
Expand All @@ -634,7 +634,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False):
result[i] = val
else:
try:
result[i] = parse(val)
result[i] = parse(val, dayfirst=dayfirst)
except Exception:
raise TypeError
return result
Expand All @@ -647,7 +647,7 @@ def string_to_datetime(ndarray[object] strings, raise_=False):
oresult[i] = val
else:
try:
oresult[i] = parse(val)
oresult[i] = parse(val, dayfirst=dayfirst)
except Exception:
if raise_:
raise
Expand Down
7 changes: 4 additions & 3 deletions pandas/src/inference.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,8 @@ def maybe_convert_objects(ndarray[object] objects, bint try_float=0,
def convert_sql_column(x):
return maybe_convert_objects(x, try_float=1)

def try_parse_dates(ndarray[object] values, parser=None):
def try_parse_dates(ndarray[object] values, parser=None,
dayfirst=False):
cdef:
Py_ssize_t i, n
ndarray[object] result
Expand All @@ -389,8 +390,8 @@ def try_parse_dates(ndarray[object] values, parser=None):

if parser is None:
try:
from dateutil import parser
parse_date = parser.parse
from dateutil.parser import parse
parse_date = lambda x: parse(x, dayfirst=dayfirst)
except ImportError: # pragma: no cover
def parse_date(s):
try:
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/test_tseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,16 @@ def _ohlc(group):
expected[0] = np.nan
assert_almost_equal(out, expected)

def test_try_parse_dates():
    """With no explicit parser, dayfirst is forwarded to dateutil's parse."""
    from dateutil.parser import parse

    raw = np.array(['5/1/2000', '6/1/2000', '7/1/2000'], dtype=object)

    parsed = lib.try_parse_dates(raw, dayfirst=True)

    reference = []
    for s in raw:
        reference.append(parse(s, dayfirst=True))
    assert(np.array_equal(parsed, reference))


class TestTypeInference(unittest.TestCase):

def test_length_zero(self):
Expand Down
1 change: 1 addition & 0 deletions scripts/count_code.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cloc pandas --force-lang=Python,pyx --not-match-f="tseries.c|sandbox.c|engines.c|sparse.c"

0 comments on commit fc56b64

Please sign in to comment.