From 8316a1e52f3680c51011347bea2332dc382979b5 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 27 Nov 2012 23:09:35 -0500 Subject: [PATCH] BUG: raise OverflowError on integer values exceeding int64 precision in parsers. close #2247 --- RELEASE.rst | 1 + pandas/io/tests/test_parsers.py | 17 +++++++++++++++++ pandas/src/parser.pyx | 23 ++++++++++++++++++++--- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/RELEASE.rst b/RELEASE.rst index 273e9615f3103..8b5243e767ab5 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -79,6 +79,7 @@ pandas 0.10.0 - Enable joins between MultiIndex and regular Index (#2024) - Fix time zone metadata issue when unioning non-overlapping DatetimeIndex objects (#2367) + - Raise/handle int64 overflows in parsers (#2247) pandas 0.9.1 ============ diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index cb02506e86979..7b5c59b71b247 100644 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -34,6 +34,8 @@ parse_date_time, parse_date_fields, parse_all_fields ) +from pandas._parser import OverflowError + def _skip_if_no_xlrd(): try: import xlrd @@ -1898,6 +1900,21 @@ def test_disable_bool_parsing(self): result = read_csv(StringIO(data), dtype=object, na_filter=False) self.assertEquals(result['B'][2], '') + def test_int64_overflow(self): + data = """ID +00013007854817840016671868 +00013007854817840016749251 +00013007854817840016754630 +00013007854817840016781876 +00013007854817840017028824 +00013007854817840017963235 +00013007854817840018860166""" + + result = read_csv(StringIO(data)) + self.assertTrue(result['ID'].dtype == object) + + self.assertRaises(OverflowError, read_csv, StringIO(data), + dtype='i8') class TestParseSQL(unittest.TestCase): diff --git a/pandas/src/parser.pyx b/pandas/src/parser.pyx index ebb04d1876ef4..d708ef2863113 100644 --- a/pandas/src/parser.pyx +++ b/pandas/src/parser.pyx @@ -76,6 +76,8 @@ cdef extern from "parser/tokenizer.h": EAT_COMMENT FINISHED + enum: ERROR_OVERFLOW + ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read, int *status) ctypedef int (*io_cleanup)(void *src) @@ -840,9 +842,13 @@ cdef class TextReader: else: col_res = None for dt in dtype_cast_order: - col_res, na_count = self._convert_with_dtype(dt, i, start, - end, na_filter, - na_hashset) + try: + col_res, na_count = self._convert_with_dtype( + dt, i, start, end, na_filter, na_hashset) + except OverflowError: + col_res, na_count = self._convert_with_dtype( + '|O8', i, start, end, na_filter, na_hashset) + if col_res is not None: break @@ -966,6 +972,11 @@ cdef class TextReader: class CParserError(Exception): pass + +class OverflowError(ValueError): + pass + + def _ensure_encoded(list lst): cdef list result = [] for x in lst: @@ -1251,6 +1262,7 @@ cdef _try_double(parser_t *parser, int col, int line_start, int line_end, return result, na_count + cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, bint na_filter, kh_str_t *na_hashset): cdef: @@ -1283,6 +1295,9 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: + if error == ERROR_OVERFLOW: + raise OverflowError(word) + return None, None else: for i in range(lines): @@ -1290,6 +1305,8 @@ cdef _try_int64(parser_t *parser, int col, int line_start, int line_end, data[i] = str_to_int64(word, INT64_MIN, INT64_MAX, &error, parser.thousands) if error != 0: + if error == ERROR_OVERFLOW: + raise OverflowError(word) return None, None return result, na_count