Skip to content

Commit

Permalink
Handle more date/datetime/time formats (#15871)
Browse files Browse the repository at this point in the history
  • Loading branch information
Winand authored and jreback committed Aug 18, 2017
1 parent 0ee1675 commit 24b6349
Show file tree
Hide file tree
Showing 7 changed files with 1,494 additions and 1,447 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,7 @@ Other Enhancements
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
- `read_*` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
- :func:`pd.read_sas()` now recognizes many more of the most frequently used date and datetime formats in SAS7BDAT files (:issue:`15871`).

.. _whatsnew_0210.api_breaking:

Expand Down
16 changes: 11 additions & 5 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,8 @@ class SAS7BDATReader(BaseIterator):
index : column identifier, defaults to None
Column to use as index.
convert_dates : boolean, defaults to True
Attempt to convert dates to Pandas datetime values. Note all
SAS date formats are supported.
Attempt to convert dates to Pandas datetime values. Note that
some rarely used SAS date formats may be unsupported.
blank_missing : boolean, defaults to True
Convert empty strings to missing values (SAS uses blanks to
indicate missing character variables).
Expand Down Expand Up @@ -655,9 +655,15 @@ def _chunk_to_dataframe(self):
rslt[name] = self._byte_chunk[jb, :].view(
dtype=self.byte_order + 'd')
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
if self.convert_dates and (self.column_formats[j] == "MMDDYY"):
epoch = pd.datetime(1960, 1, 1)
rslt[name] = epoch + pd.to_timedelta(rslt[name], unit='d')
if self.convert_dates:
unit = None
if self.column_formats[j] in const.sas_date_formats:
unit = 'd'
elif self.column_formats[j] in const.sas_datetime_formats:
unit = 's'
if unit:
rslt[name] = pd.to_datetime(rslt[name], unit=unit,
origin="1960-01-01")
jb += 1
elif self.column_types[j] == b's':
rslt[name] = self._string_chunk[js, :]
Expand Down
24 changes: 24 additions & 0 deletions pandas/io/sas/sas_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,27 @@ class index:
b"\xFF\xFF\xFF\xFE": index.columnListIndex,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": index.columnListIndex,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": index.columnListIndex}


# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
# SAS format names that mark a numeric column as a date.  The SAS7BDAT reader
# converts columns carrying one of these formats with unit 'd' relative to
# the 1960-01-01 origin.
sas_date_formats = (
    # Basic calendar, month and quarter formats
    "DATE", "DAY", "DDMMYY", "DOWNAME", "JULDAY", "JULIAN",
    "MMDDYY", "MMYY", "MMYYC", "MMYYD", "MMYYP", "MMYYS",
    "MMYYN", "MONNAME", "MONTH", "MONYY", "QTR", "QTRR",
    "NENGO", "WEEKDATE", "WEEKDATX", "WEEKDAY", "WEEKV",
    "WORDDATE", "WORDDATX", "YEAR",
    # Year-month and year-quarter families
    "YYMM", "YYMMC", "YYMMD", "YYMMP", "YYMMS", "YYMMN", "YYMON",
    "YYMMDD",
    "YYQ", "YYQC", "YYQD", "YYQP", "YYQS", "YYQN",
    "YYQR", "YYQRC", "YYQRD", "YYQRP", "YYQRS", "YYQRN",
    # Separator variants and ISO 8601 date formats
    "YYMMDDP", "YYMMDDC", "E8601DA", "YYMMDDN", "MMDDYYC",
    "MMDDYYS", "MMDDYYD", "YYMMDDS", "B8601DA", "DDMMYYN",
    "YYMMDDD", "DDMMYYB", "DDMMYYP", "MMDDYYP", "YYMMDDB",
    "MMDDYYN", "DDMMYYC", "DDMMYYD", "DDMMYYS",
    "MINGUO",
)

# SAS format names that mark a numeric column as a datetime.  The SAS7BDAT
# reader converts columns carrying one of these formats with unit 's'
# relative to the 1960-01-01 origin.
# Each name appears exactly once: the earlier listing repeated "DTMONYY" and
# "DTWKDATX", which had no effect on membership tests but made the tuple's
# length and contents misleading.
sas_datetime_formats = (
    "DATETIME", "DTWKDATX",
    "B8601DN", "B8601DT", "B8601DX", "B8601DZ", "B8601LX",
    "E8601DN", "E8601DT", "E8601DX", "E8601DZ", "E8601LX",
    "DATEAMPM", "DTDATE", "DTMONYY",
    "DTYEAR", "TOD", "MDYAMPM",
)
5 changes: 5 additions & 0 deletions pandas/tests/io/sas/data/datetime.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Date1,Date2,DateTime,DateTimeHi,Taiw
1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145226,1912-01-01
1960-01-01,1960-01-01,1960-01-01 00:00:00,1960-01-01 00:00:00.000000,1960-01-01
2016-02-29,2016-02-29,2016-02-29 23:59:59,2016-02-29 23:59:59.123456,2016-02-29
2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854774,2262-04-11
Binary file added pandas/tests/io/sas/data/datetime.sas7bdat
Binary file not shown.
Loading

0 comments on commit 24b6349

Please sign in to comment.