diff --git a/doc/source/release.rst b/doc/source/release.rst index 03e21a26c9398..5850c1fce55bf 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -147,6 +147,7 @@ Improvements to existing features - perf improvements in DataFrame construction with certain offsets, by removing faulty caching (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) - perf improvements in single-dtyped indexing (:issue:`6484`) +- ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 7c6e6a01cd041..86034c20f63d8 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -312,6 +312,9 @@ Enhancements - ``DataFrame.to_stata`` will now check data for compatibility with Stata data types and will upcast when needed. When it isn't possibly to losslessly upcast, a warning is raised (:issue:`6327`) +- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp + and data_label which allow the time stamp and dataset label to be set when creating a + file. 
(:issue:`6545`) Performance ~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c02c8abab353..6885ce95a8505 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1216,7 +1216,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def to_stata( self, fname, convert_dates=None, write_index=True, encoding="latin-1", - byteorder=None): + byteorder=None, time_stamp=None, data_label=None): """ A class for writing Stata binary dta files from array-like objects @@ -1247,7 +1247,8 @@ def to_stata( """ from pandas.io.stata import StataWriter writer = StataWriter(fname, self, convert_dates=convert_dates, - encoding=encoding, byteorder=byteorder) + encoding=encoding, byteorder=byteorder, + time_stamp=time_stamp, data_label=data_label) writer.write_file() def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2ecdb22a5cc7b..7d9d272eea1b6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -375,6 +375,18 @@ def __init__(self, encoding): 'd': np.float64(struct.unpack(' strlen = struct.unpack('b', self.path_or_buf.read(1))[0] - self.time_stamp = self.path_or_buf.read(strlen) + self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen)) self.path_or_buf.read(26) # self.path_or_buf.read(8) # 0x0000000000000000 self.path_or_buf.read(8) # position of @@ -543,11 +555,11 @@ def _read_header(self): self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] if self.format_version > 105: - self.data_label = self.path_or_buf.read(81) + self.data_label = self._null_terminate(self.path_or_buf.read(81)) else: - self.data_label = self.path_or_buf.read(32) + self.data_label = self._null_terminate(self.path_or_buf.read(32)) if self.format_version > 104: - self.time_stamp = self.path_or_buf.read(18) + self.time_stamp = self._null_terminate(self.path_or_buf.read(18)) # descriptors if self.format_version > 108: @@ -1029,6 +1041,11 @@ 
class StataWriter(StataParser): byteorder : str Can be ">", "<", "little", or "big". The default is None which uses `sys.byteorder` + time_stamp : datetime + A date time to use when writing the file. Can be None, in which + case the current time is used. + data_label : str + A label for the data set. Should be 80 characters or smaller. Returns ------- @@ -1047,10 +1064,13 @@ class StataWriter(StataParser): >>> writer.write_file() """ def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None): + encoding="latin-1", byteorder=None, time_stamp=None, + data_label=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -1086,7 +1106,7 @@ def __iter__(self): if self._write_index: data = data.reset_index() - # Check columns for compatbaility with stata + # Check columns for compatibility with stata data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape @@ -1110,7 +1130,8 @@ def __iter__(self): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._write_header() + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) self._write_descriptors() self._write_variable_labels() # write 5 zeros for expansion fields @@ -1147,7 +1168,7 @@ def _write_header(self, data_label=None, time_stamp=None): # format dd Mon yyyy hh:mm if time_stamp is None: time_stamp = datetime.datetime.now() - elif not isinstance(time_stamp, datetime): + elif not isinstance(time_stamp, datetime.datetime): raise ValueError("time_stamp should be datetime type") self._file.write( self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M")) @@ -1169,7 +1190,9 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, for c in name: if (c < 'A' or c > 
'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_': name = name.replace(c, '_') - + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = '_' + name # Variable name may not start with a number if name[0] > '0' and name[0] < '9': name = '_' + name diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index ac4b9662fc57e..307cd1bd591fb 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,6 +1,7 @@ # pylint: disable=E1101 from datetime import datetime +import datetime as dt import os import warnings import nose @@ -248,7 +249,7 @@ def test_read_write_dta10(self): original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', 'float', + columns=['string', 'object', 'integer', 'floating', 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' @@ -304,10 +305,20 @@ def test_read_write_dta11(self): def test_read_write_dta12(self): # skip_if_not_little_endian() - original = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) - formatted = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) + original = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_1', + 'astringwithmorethan32characters_2', + '+', + '-', + 'short', + 'delete']) + formatted = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_', + '_0astringwithmorethan32character', + '_', + '_1_', + '_short', + '_delete']) formatted.index.name = 'index' formatted = formatted.astype(np.int32) @@ -376,6 +387,17 @@ def test_read_write_reread_dta15(self): tm.assert_frame_equal(parsed_113, parsed_114) tm.assert_frame_equal(parsed_114, parsed_115) + def test_timestamp_and_label(self): + original = DataFrame([(1,)], 
columns=['var']) + time_stamp = datetime(2000, 2, 29, 14, 21) + data_label = 'This is a data file.' + with tm.ensure_clean() as path: + original.to_stata(path, time_stamp=time_stamp, data_label=data_label) + reader = StataReader(path) + parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M')) + assert parsed_time_stamp == time_stamp + assert reader.data_label == data_label + if __name__ == '__main__':