diff --git a/RELEASE.rst b/RELEASE.rst index 1f5bd2591470b..8a3ab284b26a6 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -33,6 +33,7 @@ pandas 0.11.1 - pd.read_html() can now parse HTML string, files or urls and return dataframes courtesy of @cpcloud. (GH3477_) - Support for reading Amazon S3 files. (GH3504_) + - Added module for reading and writing Stata files: pandas.io.stata (GH1512_) **Improvements to existing features** @@ -166,6 +167,7 @@ pandas 0.11.1 .. _GH3610: https://github.com/pydata/pandas/issues/3610 .. _GH3596: https://github.com/pydata/pandas/issues/3596 .. _GH3435: https://github.com/pydata/pandas/issues/3435 +.. _GH1512: https://github.com/pydata/pandas/issues/1512 pandas 0.11.0 diff --git a/doc/source/io.rst b/doc/source/io.rst index 39f860c63e0e6..d390f46fcd39d 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -1829,3 +1829,44 @@ There are a few other available functions: For now, writing your DataFrame into a database works only with **SQLite**. Moreover, the **index** will currently be **dropped**. + + +Reading from STATA format +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. _io.StataReader: + +.. versionadded:: 0.11.1 + +The class StataReader will read the header of the given dta file at +initialization. Its function :func:`~pandas.io.StataReader.data` will +read the observations, converting them to a DataFrame which is returned: + +.. ipython:: python + reader = StataReader(dta_filepath) + dataframe = reader.data() + +The parameter convert_categoricals indicates whether value labels should be +read and used to create a Categorical variable from them. Value labels can +also be retrieved by the function variable_labels, which requires data to be +called before. +The StataReader supports .dta Formats 104, 105, 108, 113-115. + +Alternatively, the function :func:`~pandas.io.read_stata` can be used: + +.. ipython:: python + dataframe = read_stata(dta_filepath) + + +Writing to STATA format +~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
_io.StataWriter: + +The function :func:`~pandas.io.StataWriter.write_file` will write a DataFrame +into a .dta file. The format version of this file is always the latest one, +115. + +.. ipython:: python + writer = StataWriter(filename, dataframe) + writer.write_file() diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index c89118298a675..4d983905f9aaa 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -54,6 +54,7 @@ Enhancements - support datelike columns with a timezone as data_columns (GH2852_) - ``fillna`` methods now raise a ``TypeError`` if the ``value`` parameter is a list or tuple. + - Added module for reading and writing Stata files: pandas.io.stata (GH1512_) See the `full release notes `__ or issue tracker @@ -68,3 +69,4 @@ on GitHub for a complete list. .. _GH3596: https://github.com/pydata/pandas/issues/3596 .. _GH3590: https://github.com/pydata/pandas/issues/3590 .. _GH3435: https://github.com/pydata/pandas/issues/3435 +.. _GH1512: https://github.com/pydata/pandas/issues/1512 diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c1f2f38dabd8b..73f789a9425c6 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1280,6 +1280,35 @@ def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=parse_dates, index_col=index_col, encoding=encoding) + @classmethod + def from_dta(dta, path, parse_dates=True, convert_categoricals=True, encoding=None, index_col=None): + """ + Read Stata file into DataFrame + + Parameters + ---------- + path : string file path or file handle / StringIO + parse_dates : boolean, default True + Convert date variables to DataFrame time values + convert_categoricals : boolean, default True + Read value labels and convert columns to Categorical/Factor variables + encoding : string, None or encoding, default None + Encoding used to parse the files. Note that Stata doesn't + support unicode. None defaults to cp1252. + index_col : int or sequence, default None + Column to use for index. 
If a sequence is given, a MultiIndex + is used. Different default from read_table + + Notes + ----- + + Returns + ------- + y : DataFrame + """ + from pandas.io.stata import read_stata + return read_stata(path, parse_dates=parse_dates, convert_categoricals=convert_categoricals, encoding=encoding, index=index_col) + def to_sparse(self, fill_value=None, kind='block'): """ Convert to SparseDataFrame diff --git a/pandas/io/stata.py b/pandas/io/stata.py new file mode 100644 index 0000000000000..3fc246c2ffbc7 --- /dev/null +++ b/pandas/io/stata.py @@ -0,0 +1,911 @@ +""" +Module contains tools for processing Stata files into DataFrames + +The StataReader below was originally written by Joe Presbrey as part of PyDTA. +It has been extended and improved by Skipper Seabold from the Statsmodels project +who also developed the StataWriter and was finally added to pandas in an once again +improved version. + +You can find more information on http://presbrey.mit.edu/PyDTA and +http://statsmodels.sourceforge.net/devel/ +""" + +from StringIO import StringIO +import numpy as np + +import sys +import struct +from pandas.core.frame import DataFrame +from pandas.core.series import Series +from pandas.core.categorical import Categorical +import datetime +from pandas.util import py3compat +from pandas import isnull +from pandas.io.parsers import _parser_params, _is_url, Appender + + +_read_stata_doc = """ +Read Stata file into DataFrame + +%s +""" % (_parser_params) + + +@Appender(_read_stata_doc) +def read_stata(filepath_or_buffer, convert_dates=True, convert_categoricals=True, encoding=None, index=None): + reader = StataReader(filepath_or_buffer, encoding) + + return reader.data(convert_dates, convert_categoricals, index) + + +_date_formats = ["%tc", "%tC", "%td", "%tw", "%tm", "%tq", "%th", "%ty"] + + +def _stata_elapsed_date_to_datetime(date, fmt): + """ + Convert from SIF to datetime. 
http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + date : int + The Stata Internal Format date to convert to datetime according to fmt + fmt : str + The format to convert to. Can be, tc, td, tw, tm, tq, th, ty + + Examples + -------- + >>> _stata_elapsed_date_to_datetime(52, "%tw") datetime.datetime(1961, 1, 1, 0, 0) + + Notes + ----- + datetime/c - tc + milliseconds since 01jan1960 00:00:00.000, assuming 86,400 s/day + datetime/C - tC - NOT IMPLEMENTED + milliseconds since 01jan1960 00:00:00.000, adjusted for leap seconds + date - td + days since 01jan1960 (01jan1960 = 0) + weekly date - tw + weeks since 1960w1 + This assumes 52 weeks in a year, then adds 7 * remainder of the weeks. + The datetime value is the start of the week in terms of days in the + year, not ISO calendar weeks. + monthly date - tm + months since 1960m1 + quarterly date - tq + quarters since 1960q1 + half-yearly date - th + half-years since 1960h1 yearly + date - ty + years since 0000 + + If you don't have pandas with datetime support, then you can't do + milliseconds accurately. + """ + #NOTE: we could run into overflow / loss of precision situations here + # casting to int, but I'm not sure what to do. datetime won't deal with + # numpy types and numpy datetime isn't mature enough / we can't rely on + # pandas version > 0.7.1 + #TODO: IIRC relative delta doesn't play well with np.datetime? + if np.isnan(date): + return np.datetime64('nat') + + date = int(date) + stata_epoch = datetime.datetime(1960, 1, 1) + if fmt in ["%tc", "tc"]: + from dateutil.relativedelta import relativedelta + return stata_epoch + relativedelta(microseconds=date * 1000) + elif fmt in ["%tC", "tC"]: + from warnings import warn + warn("Encountered %tC format. 
Leaving in Stata Internal Format.") + return date + elif fmt in ["%td", "td"]: + return stata_epoch + datetime.timedelta(int(date)) + elif fmt in ["%tw", "tw"]: # does not count leap days - 7 days is a week + year = datetime.datetime(stata_epoch.year + date // 52, 1, 1) + day_delta = (date % 52) * 7 + return year + datetime.timedelta(int(day_delta)) + elif fmt in ["%tm", "tm"]: + year = stata_epoch.year + date // 12 + month_delta = (date % 12) + 1 + return datetime.datetime(year, month_delta, 1) + elif fmt in ["%tq", "tq"]: + year = stata_epoch.year + date // 4 + month_delta = (date % 4) * 3 + 1 + return datetime.datetime(year, month_delta, 1) + elif fmt in ["%th", "th"]: + year = stata_epoch.year + date // 2 + month_delta = (date % 2) * 6 + 1 + return datetime.datetime(year, month_delta, 1) + elif fmt in ["%ty", "ty"]: + if date > 0: + return datetime.datetime(date, 1, 1) + else: # don't do negative years bc can't mix dtypes in column + raise ValueError("Year 0 and before not implemented") + else: + raise ValueError("Date fmt %s not understood" % fmt) + + +def _datetime_to_stata_elapsed(date, fmt): + """ + Convert from datetime to SIF. http://www.stata.com/help.cgi?datetime + + Parameters + ---------- + date : datetime.datetime + The date to convert to the Stata Internal Format given by fmt + fmt : str + The format to convert to. 
Can be, tc, td, tw, tm, tq, th, ty + """ + if not isinstance(date, datetime.datetime): + raise ValueError("date should be datetime.datetime format") + stata_epoch = datetime.datetime(1960, 1, 1) + if fmt in ["%tc", "tc"]: + delta = date - stata_epoch + return (delta.days * 86400000 + delta.seconds*1000 + + delta.microseconds/1000) + elif fmt in ["%tC", "tC"]: + from warnings import warn + warn("Stata Internal Format tC not supported.") + return date + elif fmt in ["%td", "td"]: + return (date - stata_epoch).days + elif fmt in ["%tw", "tw"]: + return (52*(date.year-stata_epoch.year) + + (date - datetime.datetime(date.year, 1, 1)).days / 7) + elif fmt in ["%tm", "tm"]: + return (12 * (date.year - stata_epoch.year) + date.month - 1) + elif fmt in ["%tq", "tq"]: + return 4*(date.year-stata_epoch.year) + int((date.month - 1)/3) + elif fmt in ["%th", "th"]: + return 2 * (date.year - stata_epoch.year) + int(date.month > 6) + elif fmt in ["%ty", "ty"]: + return date.year + else: + raise ValueError("fmt %s not understood" % fmt) + + +class StataMissingValue(object): + """ + An observation's missing value. + + Parameters + ----------- + offset + value + + Attributes + ---------- + string + value + + Notes + ----- + More information: + """ + + def __init__(self, offset, value): + self._value = value + if type(value) is int or type(value) is long: + self._str = value - offset is 1 and \ + '.' or ('.' + chr(value - offset + 96)) + else: + self._str = '.' + string = property(lambda self: self._str, doc="The Stata representation of the missing value: '.', '.a'..'.z'") + value = property(lambda self: self._value, doc='The binary representation of the missing value.') + + def __str__(self): + return self._str + + __str__.__doc__ = string.__doc__ + + +class StataParser(object): + def __init__(self, encoding): + if(encoding is None): + self._encoding = 'cp1252' + else: + self._encoding = encoding + + #type code. + #-------------------- + #str1 1 = 0x01 + #str2 2 = 0x02 + #... 
+ #str244 244 = 0xf4 + #byte 251 = 0xfb (sic) + #int 252 = 0xfc + #long 253 = 0xfd + #float 254 = 0xfe + #double 255 = 0xff + #-------------------- + #NOTE: the byte type seems to be reserved for categorical variables + # with a label, but the underlying variable is -127 to 100 + # we're going to drop the label and cast to int + self.DTYPE_MAP = \ + dict( + zip(range(1, 245), ['a' + str(i) for i in range(1, 245)]) + + [ + (251, np.int16), + (252, np.int32), + (253, np.int64), + (254, np.float32), + (255, np.float64) + ] + ) + self.TYPE_MAP = range(251) + list('bhlfd') + #NOTE: technically, some of these are wrong. there are more numbers + # that can be represented. it's the 27 ABOVE and BELOW the max listed + # numeric data type in [U] 12.2.2 of the 11.2 manual + self.MISSING_VALUES = \ + { + 'b': (-127, 100), + 'h': (-32767, 32740), + 'l': (-2147483647, 2147483620), + 'f': (-1.701e+38, +1.701e+38), + 'd': (-1.798e+308, +8.988e+307) + } + + self.OLD_TYPE_MAPPING = \ + { + 'i': 252, + 'f': 254, + 'b': 251 + } + + def _decode_bytes(self, str, errors=None): + if py3compat.PY3: + return str.decode(self._encoding, errors) + else: + return str + + +class StataReader(StataParser): + """ + Class for working with a Stata dataset. There are two possibilities for usage: + + * The from_dta() method on the DataFrame class. + This will return a DataFrame with the Stata dataset. Note that when using the + from_dta() method, you will not have access to meta-information like variable + labels or the data label. + + * Work with this object directly. Upon instantiation, the header of the Stata data + file is read, giving you access to attributes like variable_labels(), data_label(), + nobs(), ... A DataFrame with the data is returned by the read() method; this will + also fill up the value_labels. Note that calling the value_labels() method will + result in an error if the read() method has not been called yet. 
This is because + the value labels are stored at the end of a Stata dataset, after the data. + + Parameters + ---------- + path_or_buf : string or file-like object + Path to .dta file or object implementing a binary read() functions + encoding : string, None or encoding + Encoding used to parse the files. Note that Stata doesn't + support unicode. None defaults to cp1252. + """ + def __init__(self, path_or_buf, encoding=None): + super(StataReader, self).__init__(encoding) + self.col_sizes = () + self._has_string_data = False + self._missing_values = False + self._data_read = False + self._value_labels_read = False + if isinstance(path_or_buf, str) and _is_url(path_or_buf): + from urllib.request import urlopen + path_or_buf = urlopen(path_or_buf) + if py3compat.PY3: # pragma: no cover + if self._encoding: + errors = 'strict' + else: + errors = 'replace' + self._encoding = 'cp1252' + bytes = path_or_buf.read() + self.path_or_buf = StringIO(self._decode_bytes(bytes, errors)) + elif type(path_or_buf) is str: + self.path_or_buf = open(path_or_buf, 'rb') + else: + self.path_or_buf = path_or_buf + + self._read_header() + + def _read_header(self): + # header + self.format_version = struct.unpack('b', self.path_or_buf.read(1))[0] + if self.format_version not in [104, 105, 108, 113, 114, 115]: + raise ValueError("Version of given Stata file is not 104, 105, 108, 113 (Stata 8/9), 114 (Stata 10/11) or 115 (Stata 12)") + self.byteorder = self.path_or_buf.read(1) == 0x1 and '>' or '<' + self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] + self.path_or_buf.read(1) # unused + + self.nvar = struct.unpack(self.byteorder + 'H', self.path_or_buf.read(2))[0] + self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] + if self.format_version > 105: + self.data_label = self.path_or_buf.read(81) + else: + self.data_label = self.path_or_buf.read(32) + if self.format_version > 104: + self.time_stamp = self.path_or_buf.read(18) + + # descriptors + if 
self.format_version > 108: + typlist = [ord(self.path_or_buf.read(1)) for i in range(self.nvar)] + else: + typlist = [self.OLD_TYPE_MAPPING[self._decode_bytes(self.path_or_buf.read(1))] for i in range(self.nvar)] + self.typlist = [self.TYPE_MAP[typ] for typ in typlist] + self.dtyplist = [self.DTYPE_MAP[typ] for typ in typlist] + if self.format_version > 108: + self.varlist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)] + else: + self.varlist = [self._null_terminate(self.path_or_buf.read(9)) for i in range(self.nvar)] + self.srtlist = struct.unpack(self.byteorder + ('h' * (self.nvar + 1)), self.path_or_buf.read(2 * (self.nvar + 1)))[:-1] + if self.format_version > 113: + self.fmtlist = [self._null_terminate(self.path_or_buf.read(49)) for i in range(self.nvar)] + elif self.format_version > 104: + self.fmtlist = [self._null_terminate(self.path_or_buf.read(12)) for i in range(self.nvar)] + else: + self.fmtlist = [self._null_terminate(self.path_or_buf.read(7)) for i in range(self.nvar)] + if self.format_version > 108: + self.lbllist = [self._null_terminate(self.path_or_buf.read(33)) for i in range(self.nvar)] + else: + self.lbllist = [self._null_terminate(self.path_or_buf.read(9)) for i in range(self.nvar)] + if self.format_version > 105: + self.vlblist = [self._null_terminate(self.path_or_buf.read(81)) for i in range(self.nvar)] + else: + self.vlblist = [self._null_terminate(self.path_or_buf.read(32)) for i in range(self.nvar)] + + # ignore expansion fields (Format 105 and later) + # When reading, read five bytes; the last four bytes now tell you the + # size of the next read, which you discard. You then continue like + # this until you read 5 bytes of zeros. 
+ + if self.format_version > 104: + while True: + data_type = struct.unpack(self.byteorder + 'b', self.path_or_buf.read(1))[0] + if self.format_version > 108: + data_len = struct.unpack(self.byteorder + 'i', self.path_or_buf.read(4))[0] + else: + data_len = struct.unpack(self.byteorder + 'h', self.path_or_buf.read(2))[0] + if data_type == 0: + break + self.path_or_buf.read(data_len) + + # necessary data to continue parsing + self.data_location = self.path_or_buf.tell() + self.has_string_data = len([x for x in self.typlist if type(x) is int]) > 0 + self._col_size() + + def _calcsize(self, fmt): + return type(fmt) is int and fmt or struct.calcsize(self.byteorder + fmt) + + def _col_size(self, k=None): + """Calculate size of a data record.""" + if len(self.col_sizes) == 0: + self.col_sizes = map(lambda x: self._calcsize(x), self.typlist) + if k is None: + return self.col_sizes + else: + return self.col_sizes[k] + + def _unpack(self, fmt, byt): + d = struct.unpack(self.byteorder + fmt, byt)[0] + if fmt[-1] in self.MISSING_VALUES: + nmin, nmax = self.MISSING_VALUES[fmt[-1]] + if d < nmin or d > nmax: + if self._missing_values: + return StataMissingValue(nmax, d) + else: + return None + return d + + def _null_terminate(self, s): + if py3compat.PY3: # have bytes not strings, so must decode + null_byte = b"\0" + try: + s = s[:s.index(null_byte)] + except: + pass + return s.decode(self._encoding) + else: + null_byte = "\0" + try: + return s.lstrip(null_byte)[:s.index(null_byte)] + except: + return s + + def _next(self): + typlist = self.typlist + if self._has_string_data: + data = [None] * self.nvar + for i in range(len(data)): + if type(typlist[i]) is int: + data[i] = self._null_terminate(self.path_or_buf.read(typlist[i])) + else: + data[i] = self._unpack(typlist[i], self.path_or_buf.read(self._col_size(i))) + return data + else: + return map(lambda i: self._unpack(typlist[i], + self.path_or_buf.read(self._col_size(i))), + range(self.nvar)) + + def _dataset(self): + """ + 
Returns a Python generator object for iterating over the dataset. + + + Parameters + ---------- + + Returns + ------- + Generator object for iterating over the dataset. Yields each row of + observations as a list by default. + + Notes + ----- + If missing_values is True during instantiation of StataReader then + observations with _StataMissingValue(s) are not filtered and should + be handled by your applcation. + """ + + try: + self._file.seek(self._data_location) + except Exception: + pass + + for i in range(self.nobs): + yield self._next() + + def _read_value_labels(self): + if not self._data_read: + raise Exception("Data has not been read. Because of the layout of Stata files, this is necessary before reading value labels.") + if self._value_labels_read: + raise Exception("Value labels have already been read.") + + self.value_label_dict = dict() + + if self.format_version <= 108: + return # Value labels are not supported in version 108 and earlier. + + while True: + slength = self.path_or_buf.read(4) + if not slength: + break # end of variable lable table + labname = self._null_terminate(self.path_or_buf.read(33)) + self.path_or_buf.read(3) # padding + + n = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] + txtlen = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] + off = [] + for i in range(n): + off.append(struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]) + val = [] + for i in range(n): + val.append(struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0]) + txt = self.path_or_buf.read(txtlen) + self.value_label_dict[labname] = dict() + for i in range(n): + self.value_label_dict[labname][val[i]] = self._null_terminate(txt[off[i]:]) + self._value_labels_read = True + + def data(self, convert_dates=True, convert_categoricals=True, index=None): + """ + Reads observations from Stata file, converting them into a dataframe + + Parameters + ---------- + convert_dates : boolean, defaults to True + Convert date 
variables to DataFrame time values + convert_categoricals : boolean, defaults to True + Read value labels and convert columns to Categorical/Factor variables + index : identifier of index column + identifier of column that should be used as index of the DataFrame + + Returns + ------- + y : DataFrame instance + """ + if self._data_read: + raise Exception("Data has already been read.") + self._data_read = True + + stata_dta = self._dataset() + + data = [] + for rownum, line in enumerate(stata_dta): + # doesn't handle missing value objects, just casts + # None will only work without missing value object. + for i, val in enumerate(line): + #NOTE: This will only be scalar types because missing strings + # are empty not None in Stata + if val is None: + line[i] = np.nan + data.append(tuple(line)) + + if convert_categoricals: + self._read_value_labels() + + data = DataFrame(data, columns=self.varlist, index=index) + + cols_ = np.where(self.dtyplist)[0] + for i in cols_: + if self.dtyplist[i] is not None: + col = data.columns[i] + data[col] = Series(data[col], data[col].index, self.dtyplist[i]) + + if convert_dates: + cols = np.where(map(lambda x: x in _date_formats, self.fmtlist))[0] + for i in cols: + col = data.columns[i] + data[col] = data[col].apply(_stata_elapsed_date_to_datetime, args=(self.fmtlist[i],)) + + if convert_categoricals: + cols = np.where(map(lambda x: x in self.value_label_dict.iterkeys(), self.lbllist))[0] + for i in cols: + col = data.columns[i] + labeled_data = np.copy(data[col]) + labeled_data = labeled_data.astype(object) + for k, v in self.value_label_dict[self.lbllist[i]].iteritems(): + labeled_data[data[col] == k] = v + data[col] = Categorical.from_array(labeled_data) + + return data + + def data_label(self): + """Returns data label of Stata file""" + return self.data_label + + def variable_labels(self): + """Returns variable labels as a dict, associating each variable name with corresponding label""" + return dict(zip(self.varlist, 
self.vlblist)) + + def value_labels(self): + """Returns a dict, associating each variable name a dict, associating each value its corresponding label""" + if not self._value_labels_read: + self._read_value_labels() + + return self.value_label_dict + + +def _open_file_binary_write(fname, encoding): + if hasattr(fname, 'write'): + #if 'b' not in fname.mode: + return fname + return open(fname, "wb") + + +def _set_endianness(endianness): + if endianness.lower() in ["<", "little"]: + return "<" + elif endianness.lower() in [">", "big"]: + return ">" + else: # pragma : no cover + raise ValueError("Endianness %s not understood" % endianness) + + +def _pad_bytes(name, length): + """ + Takes a char string and pads it wih null bytes until it's length chars + """ + return name + "\x00" * (length - len(name)) + + +def _default_names(nvar): + """ + Returns default Stata names v1, v2, ... vnvar + """ + return ["v%d" % i for i in range(1, nvar+1)] + + +def _convert_datetime_to_stata_type(fmt): + """ + Converts from one of the stata date formats to a type in TYPE_MAP + """ + if fmt in ["tc", "%tc", "td", "%td", "tw", "%tw", "tm", "%tm", "tq", + "%tq", "th", "%th", "ty", "%ty"]: + return np.float64 # Stata expects doubles for SIFs + else: + raise ValueError("fmt %s not understood" % fmt) + + +def _maybe_convert_to_int_keys(convert_dates, varlist): + new_dict = {} + for key in convert_dates: + if not convert_dates[key].startswith("%"): # make sure proper fmts + convert_dates[key] = "%" + convert_dates[key] + if key in varlist: + new_dict.update({varlist.index(key): convert_dates[key]}) + else: + if not isinstance(key, int): + raise ValueError("convery_dates key is not in varlist and is not an int") + new_dict.update({key: convert_dates[key]}) + return new_dict + + +def _dtype_to_stata_type(dtype): + """ + Converts dtype types to stata types. Returns the byte of the given ordinal. + See TYPE_MAP and comments for an explanation. This is also explained in + the dta spec. 
+ 1 - 244 are strings of this length + 251 - chr(251) - for int8 and int16, byte + 252 - chr(252) - for int32, int + 253 - chr(253) - for int64, long + 254 - chr(254) - for float32, float + 255 - chr(255) - double, double + + If there are dates to convert, then dtype will already have the correct + type inserted. + """ + #TODO: expand to handle datetime to integer conversion + if dtype.type == np.string_: + return chr(dtype.itemsize) + elif dtype.type == np.object_: # try to coerce it to the biggest string + # not memory efficient, what else could we do? + return chr(244) + elif dtype == np.float64: + return chr(255) + elif dtype == np.float32: + return chr(254) + elif dtype == np.int64: + return chr(253) + elif dtype == np.int32: + return chr(252) + elif dtype == np.int8 or dtype == np.int16: + return chr(251) + else: # pragma : no cover + raise ValueError("Data type %s not currently understood. " + "Please report an error to the developers." % dtype) + + +def _dtype_to_default_stata_fmt(dtype): + """ + Maps numpy dtype to stata's default format for this type. Not terribly + important since users can change this in Stata. Semantics are + + string -> "%DDs" where DD is the length of the string + float64 -> "%10.0g" + float32 -> "%9.0g" + int64 -> "%9.0g" + int32 -> "%12.0g" + int16 -> "%8.0g" + int8 -> "%8.0g" + """ + #TODO: expand this to handle a default datetime format? + if dtype.type == np.string_: + return "%" + str(dtype.itemsize) + "s" + elif dtype.type == np.object_: + return "%244s" + elif dtype == np.float64: + return "%10.0g" + elif dtype == np.float32: + return "%9.0g" + elif dtype == np.int64: + return "%9.0g" + elif dtype == np.int32: + return "%12.0g" + elif dtype == np.int8 or dtype == np.int16: + return "%8.0g" + else: # pragma : no cover + raise ValueError("Data type %s not currently understood. " + "Please report an error to the developers." 
% dtype) + + +class StataWriter(StataParser): + """ + A class for writing Stata binary dta files from array-like objects + + Parameters + ---------- + fname : file path or buffer + Where to save the dta file. + data : array-like + Array-like input to save. Pandas objects are also accepted. + convert_dates : dict + Dictionary mapping column of datetime types to the stata internal + format that you want to use for the dates. Options are + 'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a + number or a name. + encoding : str + Default is latin-1. Note that Stata does not support unicode. + byteorder : str + Can be ">", "<", "little", or "big". The default is None which uses + `sys.byteorder` + + Returns + ------- + writer : StataWriter instance + The StataWriter instance has a write_file method, which will + write the file to the given `fname`. + + Examples + -------- + >>> writer = StataWriter('./data_file.dta', data) + >>> writer.write_file() + + Or with dates + + >>> writer = StataWriter('./date_data_file.dta', date, {2 : 'tw'}) + >>> writer.write_file() + """ + def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", + byteorder=None): + super(StataWriter, self).__init__(encoding) + self._convert_dates = convert_dates + self._write_index = write_index + # attach nobs, nvars, data, varlist, typlist + self._prepare_pandas(data) + + if byteorder is None: + byteorder = sys.byteorder + self._byteorder = _set_endianness(byteorder) + self._file = _open_file_binary_write(fname, self._encoding) + self.type_converters = {253: np.long, 252: int} + + def _write(self, to_write): + """ + Helper to call encode before writing to file for Python 3 compat. 
+ """ + if py3compat.PY3: + self._file.write(to_write.encode(self._encoding)) + else: + self._file.write(to_write) + + def _prepare_pandas(self, data): + #NOTE: we might need a different API / class for pandas objects so + # we can set different semantics - handle this with a PR to pandas.io + class DataFrameRowIter(object): + def __init__(self, data): + self.data = data + + def __iter__(self): + for i, row in data.iterrows(): + yield row + + if self._write_index: + data = data.reset_index() + self.datarows = DataFrameRowIter(data) + self.nobs, self.nvar = data.shape + self.data = data + self.varlist = data.columns.tolist() + dtypes = data.dtypes + if self._convert_dates is not None: + self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates, self.varlist) + for key in self._convert_dates: + new_type = _convert_datetime_to_stata_type(self._convert_dates[key]) + dtypes[key] = np.dtype(new_type) + self.typlist = [_dtype_to_stata_type(dt) for dt in dtypes] + self.fmtlist = [_dtype_to_default_stata_fmt(dt) for dt in dtypes] + # set the given format for the datetime cols + if self._convert_dates is not None: + for key in self._convert_dates: + self.fmtlist[key] = self._convert_dates[key] + + def write_file(self): + self._write_header() + self._write_descriptors() + self._write_variable_labels() + # write 5 zeros for expansion fields + self._write(_pad_bytes("", 5)) + if self._convert_dates is None: + self._write_data_nodates() + else: + self._write_data_dates() + #self._write_value_labels() + self._file.close() + + def _write_header(self, data_label=None, time_stamp=None): + byteorder = self._byteorder + # ds_format - just use 114 + self._file.write(struct.pack("b", 114)) + # byteorder + self._write(byteorder == ">" and "\x01" or "\x02") + # filetype + self._write("\x01") + # unused + self._write("\x00") + # number of vars, 2 bytes + self._file.write(struct.pack(byteorder+"h", self.nvar)[:2]) + # number of obs, 4 bytes + 
self._file.write(struct.pack(byteorder+"i", self.nobs)[:4]) + # data label 81 bytes, char, null terminated + if data_label is None: + self._file.write(self._null_terminate(_pad_bytes("", 80))) + else: + self._file.write(self._null_terminate(_pad_bytes(data_label[:80], 80))) + # time stamp, 18 bytes, char, null terminated + # format dd Mon yyyy hh:mm + if time_stamp is None: + time_stamp = datetime.datetime.now() + elif not isinstance(time_stamp, datetime): + raise ValueError("time_stamp should be datetime type") + self._file.write(self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"))) + + def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, + fmtlist=None, lbllist=None): + nvar = self.nvar + # typlist, length nvar, format byte array + for typ in self.typlist: + self._write(typ) + + # varlist, length 33*nvar, char array, null terminated + for name in self.varlist: + name = self._null_terminate(name, True) + name = _pad_bytes(name[:32], 33) + self._write(name) + + # srtlist, 2*(nvar+1), int array, encoded by byteorder + srtlist = _pad_bytes("", (2*(nvar+1))) + self._write(srtlist) + + # fmtlist, 49*nvar, char array + for fmt in self.fmtlist: + self._write(_pad_bytes(fmt, 49)) + + # lbllist, 33*nvar, char array + #NOTE: this is where you could get fancy with pandas categorical type + for i in range(nvar): + self._write(_pad_bytes("", 33)) + + def _write_variable_labels(self, labels=None): + nvar = self.nvar + if labels is None: + for i in range(nvar): + self._write(_pad_bytes("", 81)) + + def _write_data_nodates(self): + data = self.datarows + byteorder = self._byteorder + TYPE_MAP = self.TYPE_MAP + typlist = self.typlist + for row in data: + #row = row.squeeze().tolist() # needed for structured arrays + for i, var in enumerate(row): + typ = ord(typlist[i]) + if typ <= 244: # we've got a string + if len(var) < typ: + var = _pad_bytes(self._decode_bytes(var), len(var) + 1) + self._write(var) + else: + try: + 
self._file.write(struct.pack(byteorder + TYPE_MAP[typ], var)) + except struct.error: + # have to be strict about type pack won't do any + # kind of casting + self._file.write(struct.pack(byteorder+TYPE_MAP[typ], + self.type_converters[typ](var))) + + def _write_data_dates(self): + convert_dates = self._convert_dates + data = self.datarows + byteorder = self._byteorder + TYPE_MAP = self.TYPE_MAP + MISSING_VALUES = self.MISSING_VALUES + typlist = self.typlist + for row in data: + #row = row.squeeze().tolist() # needed for structured arrays + for i, var in enumerate(row): + typ = ord(typlist[i]) + #NOTE: If anyone finds this terribly slow, there is + # a vectorized way to convert dates, see genfromdta for going + # from int to datetime and reverse it. will copy data though + if i in convert_dates: + var = _datetime_to_stata_elapsed(var, self.fmtlist[i]) + if typ <= 244: # we've got a string + if isnull(var): + var = "" # missing string + if len(var) < typ: + var = _pad_bytes(var, len(var) + 1) + self._write(var) + else: + if isnull(var): # this only matters for floats + var = MISSING_VALUES[typ] + self._write(struct.pack(byteorder+TYPE_MAP[typ], var)) + + def _null_terminate(self, s, as_string=False): + null_byte = '\x00' + if py3compat.PY3 and not as_string: + s += null_byte + return s.encode(self._encoding) + else: + s += null_byte + return s diff --git a/pandas/io/tests/data/stata1.dta b/pandas/io/tests/data/stata1.dta new file mode 100644 index 0000000000000..7df75d0d0cded Binary files /dev/null and b/pandas/io/tests/data/stata1.dta differ diff --git a/pandas/io/tests/data/stata2.dta b/pandas/io/tests/data/stata2.dta new file mode 100644 index 0000000000000..c60cf480ad5dd Binary files /dev/null and b/pandas/io/tests/data/stata2.dta differ diff --git a/pandas/io/tests/data/stata3.csv b/pandas/io/tests/data/stata3.csv new file mode 100644 index 0000000000000..25175f7f706ed --- /dev/null +++ b/pandas/io/tests/data/stata3.csv @@ -0,0 +1,204 @@ 
+"year","quarter","realgdp","realcons","realinv","realgovt","realdpi","cpi","m1","tbilrate","unemp","pop","infl","realint" +1959,1,2710.349,1707.4,286.898,470.045,1886.9,28.980,139.7,2.82,5.8,177.146,0,0 +1959,2,2778.801,1733.7,310.859,481.301,1919.7,29.150,141.7,3.08,5.1,177.830,2.34,0.74 +1959,3,2775.488,1751.8,289.226,491.260,1916.4,29.350,140.5,3.82,5.3,178.657,2.74,1.09 +1959,4,2785.204,1753.7,299.356,484.052,1931.3,29.370,140,4.33,5.6,179.386,0.27,4.06 +1960,1,2847.699,1770.5,331.722,462.199,1955.5,29.540,139.6,3.50,5.2,180.007,2.31,1.19 +1960,2,2834.390,1792.9,298.152,460.400,1966.1,29.550,140.2,2.68,5.2,180.671,0.14,2.55 +1960,3,2839.022,1785.8,296.375,474.676,1967.8,29.750,140.9,2.36,5.6,181.528,2.7,-0.34 +1960,4,2802.616,1788.2,259.764,476.434,1966.6,29.840,141.1,2.29,6.3,182.287,1.21,1.08 +1961,1,2819.264,1787.7,266.405,475.854,1984.5,29.810,142.1,2.37,6.8,182.992,-0.4,2.77 +1961,2,2872.005,1814.3,286.246,480.328,2014.4,29.920,142.9,2.29,7,183.691,1.47,0.81 +1961,3,2918.419,1823.1,310.227,493.828,2041.9,29.980,144.1,2.32,6.8,184.524,0.8,1.52 +1961,4,2977.830,1859.6,315.463,502.521,2082.0,30.040,145.2,2.60,6.2,185.242,0.8,1.8 +1962,1,3031.241,1879.4,334.271,520.960,2101.7,30.210,146.4,2.73,5.6,185.874,2.26,0.47 +1962,2,3064.709,1902.5,331.039,523.066,2125.2,30.220,146.5,2.78,5.5,186.538,0.13,2.65 +1962,3,3093.047,1917.9,336.962,538.838,2137.0,30.380,146.7,2.78,5.6,187.323,2.11,0.67 +1962,4,3100.563,1945.1,325.650,535.912,2154.6,30.440,148.3,2.87,5.5,188.013,0.79,2.08 +1963,1,3141.087,1958.2,343.721,522.917,2172.5,30.480,149.7,2.90,5.8,188.580,0.53,2.38 +1963,2,3180.447,1976.9,348.730,518.108,2193.1,30.690,151.3,3.03,5.7,189.242,2.75,0.29 +1963,3,3240.332,2003.8,360.102,546.893,2217.9,30.750,152.6,3.38,5.5,190.028,0.78,2.6 +1963,4,3264.967,2020.6,364.534,532.383,2254.6,30.940,153.7,3.52,5.6,190.668,2.46,1.06 +1964,1,3338.246,2060.5,379.523,529.686,2299.6,30.950,154.8,3.51,5.5,191.245,0.13,3.38 
+1964,2,3376.587,2096.7,377.778,526.175,2362.1,31.020,156.8,3.47,5.2,191.889,0.9,2.57 +1964,3,3422.469,2135.2,386.754,522.008,2392.7,31.120,159.2,3.53,5,192.631,1.29,2.25 +1964,4,3431.957,2141.2,389.910,514.603,2420.4,31.280,160.7,3.76,5,193.223,2.05,1.71 +1965,1,3516.251,2188.8,429.145,508.006,2447.4,31.380,162,3.93,4.9,193.709,1.28,2.65 +1965,2,3563.960,2213.0,429.119,508.931,2474.5,31.580,163.1,3.84,4.7,194.303,2.54,1.3 +1965,3,3636.285,2251.0,444.444,529.446,2542.6,31.650,166,3.93,4.4,194.997,0.89,3.04 +1965,4,3724.014,2314.3,446.493,544.121,2594.1,31.880,169.1,4.35,4.1,195.539,2.9,1.46 +1966,1,3815.423,2348.5,484.244,556.593,2618.4,32.280,171.8,4.62,3.9,195.999,4.99,-0.37 +1966,2,3828.124,2354.5,475.408,571.371,2624.7,32.450,170.3,4.65,3.8,196.560,2.1,2.55 +1966,3,3853.301,2381.5,470.697,594.514,2657.8,32.850,171.2,5.23,3.8,197.207,4.9,0.33 +1966,4,3884.520,2391.4,472.957,599.528,2688.2,32.900,171.9,5.00,3.7,197.736,0.61,4.39 +1967,1,3918.740,2405.3,460.007,640.682,2728.4,33.100,174.2,4.22,3.8,198.206,2.42,1.8 +1967,2,3919.556,2438.1,440.393,631.430,2750.8,33.400,178.1,3.78,3.8,198.712,3.61,0.17 +1967,3,3950.826,2450.6,453.033,641.504,2777.1,33.700,181.6,4.42,3.8,199.311,3.58,0.84 +1967,4,3980.970,2465.7,462.834,640.234,2797.4,34.100,184.3,4.90,3.9,199.808,4.72,0.18 +1968,1,4063.013,2524.6,472.907,651.378,2846.2,34.400,186.6,5.18,3.7,200.208,3.5,1.67 +1968,2,4131.998,2563.3,492.026,646.145,2893.5,34.900,190.5,5.50,3.5,200.706,5.77,-0.28 +1968,3,4160.267,2611.5,476.053,640.615,2899.3,35.300,194,5.21,3.5,201.290,4.56,0.65 +1968,4,4178.293,2623.5,480.998,636.729,2918.4,35.700,198.7,5.85,3.4,201.760,4.51,1.34 +1969,1,4244.100,2652.9,512.686,633.224,2923.4,36.300,200.7,6.08,3.4,202.161,6.67,-0.58 +1969,2,4256.460,2669.8,508.601,623.160,2952.9,36.800,201.7,6.49,3.4,202.677,5.47,1.02 +1969,3,4283.378,2682.7,520.360,623.613,3012.9,37.300,202.9,7.02,3.6,203.302,5.4,1.63 +1969,4,4263.261,2704.1,492.334,606.900,3034.9,37.900,206.2,7.64,3.6,203.849,6.38,1.26 
+1970,1,4256.573,2720.7,476.925,594.888,3050.1,38.500,206.7,6.76,4.2,204.401,6.28,0.47 +1970,2,4264.289,2733.2,478.419,576.257,3103.5,38.900,208,6.66,4.8,205.052,4.13,2.52 +1970,3,4302.259,2757.1,486.594,567.743,3145.4,39.400,212.9,6.15,5.2,205.788,5.11,1.04 +1970,4,4256.637,2749.6,458.406,564.666,3135.1,39.900,215.5,4.86,5.8,206.466,5.04,-0.18 +1971,1,4374.016,2802.2,517.935,542.709,3197.3,40.100,220,3.65,5.9,207.065,2,1.65 +1971,2,4398.829,2827.9,533.986,534.905,3245.3,40.600,224.9,4.76,5.9,207.661,4.96,-0.19 +1971,3,4433.943,2850.4,541.010,532.646,3259.7,40.900,227.2,4.70,6,208.345,2.94,1.75 +1971,4,4446.264,2897.8,524.085,516.140,3294.2,41.200,230.1,3.87,6,208.917,2.92,0.95 +1972,1,4525.769,2936.5,561.147,518.192,3314.9,41.500,235.6,3.55,5.8,209.386,2.9,0.64 +1972,2,4633.101,2992.6,595.495,526.473,3346.1,41.800,238.8,3.86,5.7,209.896,2.88,0.98 +1972,3,4677.503,3038.8,603.970,498.116,3414.6,42.200,245,4.47,5.6,210.479,3.81,0.66 +1972,4,4754.546,3110.1,607.104,496.540,3550.5,42.700,251.5,5.09,5.3,210.985,4.71,0.38 +1973,1,4876.166,3167.0,645.654,504.838,3590.7,43.700,252.7,5.98,5,211.420,9.26,-3.28 +1973,2,4932.571,3165.4,675.837,497.033,3626.2,44.200,257.5,7.19,4.9,211.909,4.55,2.64 +1973,3,4906.252,3176.7,649.412,475.897,3644.4,45.600,259,8.06,4.8,212.475,12.47,-4.41 +1973,4,4953.050,3167.4,674.253,476.174,3688.9,46.800,263.8,7.68,4.8,212.932,10.39,-2.71 +1974,1,4909.617,3139.7,631.230,491.043,3632.3,48.100,267.2,7.80,5.1,213.361,10.96,-3.16 +1974,2,4922.188,3150.6,628.102,490.177,3601.1,49.300,269.3,7.89,5.2,213.854,9.86,-1.96 +1974,3,4873.520,3163.6,592.672,492.586,3612.4,51.000,272.3,8.16,5.6,214.451,13.56,-5.4 +1974,4,4854.340,3117.3,598.306,496.176,3596.0,52.300,273.9,6.96,6.6,214.931,10.07,-3.11 +1975,1,4795.295,3143.4,493.212,490.603,3581.9,53.000,276.2,5.53,8.2,215.353,5.32,0.22 +1975,2,4831.942,3195.8,476.085,486.679,3749.3,54.000,283.7,5.57,8.9,215.973,7.48,-1.91 +1975,3,4913.328,3241.4,516.402,498.836,3698.6,54.900,285.4,6.27,8.5,216.587,6.61,-0.34 
+1975,4,4977.511,3275.7,530.596,500.141,3736.0,55.800,288.4,5.26,8.3,217.095,6.5,-1.24 +1976,1,5090.663,3341.2,585.541,495.568,3791.0,56.100,294.7,4.91,7.7,217.528,2.14,2.77 +1976,2,5128.947,3371.8,610.513,494.532,3822.2,57.000,297.2,5.28,7.6,218.035,6.37,-1.09 +1976,3,5154.072,3407.5,611.646,493.141,3856.7,57.900,302,5.05,7.7,218.644,6.27,-1.22 +1976,4,5191.499,3451.8,615.898,494.415,3884.4,58.700,308.3,4.57,7.8,219.179,5.49,-0.92 +1977,1,5251.762,3491.3,646.198,498.509,3887.5,60.000,316,4.60,7.5,219.684,8.76,-4.16 +1977,2,5356.131,3510.6,696.141,506.695,3931.8,60.800,320.2,5.06,7.1,220.239,5.3,-0.24 +1977,3,5451.921,3544.1,734.078,509.605,3990.8,61.600,326.4,5.82,6.9,220.904,5.23,0.59 +1977,4,5450.793,3597.5,713.356,504.584,4071.2,62.700,334.4,6.20,6.6,221.477,7.08,-0.88 +1978,1,5469.405,3618.5,727.504,506.314,4096.4,63.900,339.9,6.34,6.3,221.991,7.58,-1.24 +1978,2,5684.569,3695.9,777.454,518.366,4143.4,65.500,347.6,6.72,6,222.585,9.89,-3.18 +1978,3,5740.300,3711.4,801.452,520.199,4177.1,67.100,353.3,7.64,6,223.271,9.65,-2.01 +1978,4,5816.222,3741.3,819.689,524.782,4209.8,68.500,358.6,9.02,5.9,223.865,8.26,0.76 +1979,1,5825.949,3760.2,819.556,525.524,4255.9,70.600,368,9.42,5.9,224.438,12.08,-2.66 +1979,2,5831.418,3758.0,817.660,532.040,4226.1,73.000,377.2,9.30,5.7,225.055,13.37,-4.07 +1979,3,5873.335,3794.9,801.742,531.232,4250.3,75.200,380.8,10.49,5.9,225.801,11.88,-1.38 +1979,4,5889.495,3805.0,786.817,531.126,4284.3,78.000,385.8,11.94,5.9,226.451,14.62,-2.68 +1980,1,5908.467,3798.4,781.114,548.115,4296.2,80.900,383.8,13.75,6.3,227.061,14.6,-0.85 +1980,2,5787.373,3712.2,710.640,561.895,4236.1,82.600,394,7.90,7.3,227.726,8.32,-0.42 +1980,3,5776.617,3752.0,656.477,554.292,4279.7,84.700,409,10.34,7.7,228.417,10.04,0.3 +1980,4,5883.460,3802.0,723.220,556.130,4368.1,87.200,411.3,14.75,7.4,228.937,11.64,3.11 +1981,1,6005.717,3822.8,795.091,567.618,4358.1,89.100,427.4,13.95,7.4,229.403,8.62,5.32 
+1981,2,5957.795,3822.8,757.240,584.540,4358.6,91.500,426.9,15.33,7.4,229.966,10.63,4.69 +1981,3,6030.184,3838.3,804.242,583.890,4455.4,93.400,428.4,14.58,7.4,230.641,8.22,6.36 +1981,4,5955.062,3809.3,773.053,590.125,4464.4,94.400,442.7,11.33,8.2,231.157,4.26,7.07 +1982,1,5857.333,3833.9,692.514,591.043,4469.6,95.000,447.1,12.95,8.8,231.645,2.53,10.42 +1982,2,5889.074,3847.7,691.900,596.403,4500.8,97.500,448,11.97,9.4,232.188,10.39,1.58 +1982,3,5866.370,3877.2,683.825,605.370,4520.6,98.100,464.5,8.10,9.9,232.816,2.45,5.65 +1982,4,5871.001,3947.9,622.930,623.307,4536.4,97.900,477.2,7.96,10.7,233.322,-0.82,8.77 +1983,1,5944.020,3986.6,645.110,630.873,4572.2,98.800,493.2,8.22,10.4,233.781,3.66,4.56 +1983,2,6077.619,4065.7,707.372,644.322,4605.5,99.800,507.8,8.69,10.1,234.307,4.03,4.66 +1983,3,6197.468,4137.6,754.937,662.412,4674.7,100.800,517.2,8.99,9.4,234.907,3.99,5.01 +1983,4,6325.574,4203.2,834.427,639.197,4771.1,102.100,525.1,8.89,8.5,235.385,5.13,3.76 +1984,1,6448.264,4239.2,921.763,644.635,4875.4,103.300,535,9.43,7.9,235.839,4.67,4.76 +1984,2,6559.594,4299.9,952.841,664.839,4959.4,104.100,540.9,9.94,7.5,236.348,3.09,6.85 +1984,3,6623.343,4333.0,974.989,662.294,5036.6,105.100,543.7,10.19,7.4,236.976,3.82,6.37 +1984,4,6677.264,4390.1,958.993,684.282,5084.5,105.700,557,8.14,7.3,237.468,2.28,5.87 +1985,1,6740.275,4464.6,927.375,691.613,5072.0,107.000,570.4,8.25,7.3,237.900,4.89,3.36 +1985,2,6797.344,4505.2,943.383,708.524,5172.7,107.700,589.1,7.17,7.3,238.466,2.61,4.56 +1985,3,6903.523,4590.8,932.959,732.305,5140.7,108.500,607.8,7.13,7.2,239.113,2.96,4.17 +1985,4,6955.918,4600.9,969.434,732.026,5193.9,109.900,621.4,7.14,7,239.638,5.13,2.01 +1986,1,7022.757,4639.3,967.442,728.125,5255.8,108.700,641,6.56,7,240.094,-4.39,10.95 +1986,2,7050.969,4688.7,945.972,751.334,5315.5,109.500,670.3,6.06,7.2,240.651,2.93,3.13 +1986,3,7118.950,4770.7,916.315,779.770,5343.3,110.200,694.9,5.31,7,241.274,2.55,2.76 
+1986,4,7153.359,4799.4,917.736,767.671,5346.5,111.400,730.2,5.44,6.8,241.784,4.33,1.1 +1987,1,7193.019,4792.1,945.776,772.247,5379.4,112.700,743.9,5.61,6.6,242.252,4.64,0.97 +1987,2,7269.510,4856.3,947.100,782.962,5321.0,113.800,743,5.67,6.3,242.804,3.89,1.79 +1987,3,7332.558,4910.4,948.055,783.804,5416.2,115.000,756.2,6.19,6,243.446,4.2,1.99 +1987,4,7458.022,4922.2,1021.980,795.467,5493.1,116.000,756.2,5.76,5.9,243.981,3.46,2.29 +1988,1,7496.600,5004.4,964.398,773.851,5562.1,117.200,768.1,5.76,5.7,244.445,4.12,1.64 +1988,2,7592.881,5040.8,987.858,765.980,5614.3,118.500,781.4,6.48,5.5,245.021,4.41,2.07 +1988,3,7632.082,5080.6,994.204,760.245,5657.5,119.900,783.3,7.22,5.5,245.693,4.7,2.52 +1988,4,7733.991,5140.4,1007.371,783.065,5708.5,121.200,785.7,8.03,5.3,246.224,4.31,3.72 +1989,1,7806.603,5159.3,1045.975,767.024,5773.4,123.100,779.2,8.67,5.2,246.721,6.22,2.44 +1989,2,7865.016,5182.4,1033.753,784.275,5749.8,124.500,777.8,8.15,5.2,247.342,4.52,3.63 +1989,3,7927.393,5236.1,1021.604,791.819,5787.0,125.400,786.6,7.76,5.3,248.067,2.88,4.88 +1989,4,7944.697,5261.7,1011.119,787.844,5831.3,127.500,795.4,7.65,5.4,248.659,6.64,1.01 +1990,1,8027.693,5303.3,1021.070,799.681,5875.1,128.900,806.2,7.80,5.3,249.306,4.37,3.44 +1990,2,8059.598,5320.8,1021.360,800.639,5913.9,130.500,810.1,7.70,5.3,250.132,4.93,2.76 +1990,3,8059.476,5341.0,997.319,793.513,5918.1,133.400,819.8,7.33,5.7,251.057,8.79,-1.46 +1990,4,7988.864,5299.5,934.248,800.525,5878.2,134.700,827.2,6.67,6.1,251.889,3.88,2.79 +1991,1,7950.164,5284.4,896.210,806.775,5896.3,135.100,843.2,5.83,6.6,252.643,1.19,4.65 +1991,2,8003.822,5324.7,891.704,809.081,5941.1,136.200,861.5,5.54,6.8,253.493,3.24,2.29 +1991,3,8037.538,5345.0,913.904,793.987,5953.6,137.200,878,5.18,6.9,254.435,2.93,2.25 +1991,4,8069.046,5342.6,948.891,778.378,5992.4,138.300,910.4,4.14,7.1,255.214,3.19,0.95 +1992,1,8157.616,5434.5,927.796,778.568,6082.9,139.400,943.8,3.88,7.4,255.992,3.17,0.71 
+1992,2,8244.294,5466.7,988.912,777.762,6129.5,140.500,963.2,3.50,7.6,256.894,3.14,0.36 +1992,3,8329.361,5527.1,999.135,786.639,6160.6,141.700,1003.8,2.97,7.6,257.861,3.4,-0.44 +1992,4,8417.016,5594.6,1030.758,787.064,6248.2,142.800,1030.4,3.12,7.4,258.679,3.09,0.02 +1993,1,8432.485,5617.2,1054.979,762.901,6156.5,143.800,1047.6,2.92,7.2,259.414,2.79,0.13 +1993,2,8486.435,5671.1,1063.263,752.158,6252.3,144.500,1084.5,3.02,7.1,260.255,1.94,1.08 +1993,3,8531.108,5732.7,1062.514,744.227,6265.7,145.600,1113,3.00,6.8,261.163,3.03,-0.04 +1993,4,8643.769,5783.7,1118.583,748.102,6358.1,146.300,1131.6,3.05,6.6,261.919,1.92,1.13 +1994,1,8727.919,5848.1,1166.845,721.288,6332.6,147.200,1141.1,3.48,6.6,262.631,2.45,1.02 +1994,2,8847.303,5891.5,1234.855,717.197,6440.6,148.400,1150.5,4.20,6.2,263.436,3.25,0.96 +1994,3,8904.289,5938.7,1212.655,736.890,6487.9,149.400,1150.1,4.68,6,264.301,2.69,2 +1994,4,9003.180,5997.3,1269.190,716.702,6574.0,150.500,1151.4,5.53,5.6,265.044,2.93,2.6 +1995,1,9025.267,6004.3,1282.090,715.326,6616.6,151.800,1149.3,5.72,5.5,265.755,3.44,2.28 +1995,2,9044.668,6053.5,1247.610,712.492,6617.2,152.600,1145.4,5.52,5.7,266.557,2.1,3.42 +1995,3,9120.684,6107.6,1235.601,707.649,6666.8,153.500,1137.3,5.32,5.7,267.456,2.35,2.97 +1995,4,9184.275,6150.6,1270.392,681.081,6706.2,154.700,1123.5,5.17,5.6,268.151,3.11,2.05 +1996,1,9247.188,6206.9,1287.128,695.265,6777.7,156.100,1124.8,4.91,5.5,268.853,3.6,1.31 +1996,2,9407.052,6277.1,1353.795,705.172,6850.6,157.000,1112.4,5.09,5.5,269.667,2.3,2.79 +1996,3,9488.879,6314.6,1422.059,692.741,6908.9,158.200,1086.1,5.04,5.3,270.581,3.05,2 +1996,4,9592.458,6366.1,1418.193,690.744,6946.8,159.400,1081.5,4.99,5.3,271.360,3.02,1.97 +1997,1,9666.235,6430.2,1451.304,681.445,7008.9,159.900,1063.8,5.10,5.2,272.083,1.25,3.85 +1997,2,9809.551,6456.2,1543.976,693.525,7061.5,160.400,1066.2,5.01,5,272.912,1.25,3.76 +1997,3,9932.672,6566.0,1571.426,691.261,7142.4,161.500,1065.5,5.02,4.9,273.852,2.73,2.29 
+1997,4,10008.874,6641.1,1596.523,690.311,7241.5,162.000,1074.4,5.11,4.7,274.626,1.24,3.88 +1998,1,10103.425,6707.2,1672.732,668.783,7406.2,162.200,1076.1,5.02,4.6,275.304,0.49,4.53 +1998,2,10194.277,6822.6,1652.716,687.184,7512.0,163.200,1075,4.98,4.4,276.115,2.46,2.52 +1998,3,10328.787,6913.1,1700.071,681.472,7591.0,163.900,1086,4.49,4.5,277.003,1.71,2.78 +1998,4,10507.575,7019.1,1754.743,688.147,7646.5,164.700,1097.8,4.38,4.4,277.790,1.95,2.43 +1999,1,10601.179,7088.3,1809.993,683.601,7698.4,165.900,1101.9,4.39,4.3,278.451,2.9,1.49 +1999,2,10684.049,7199.9,1803.674,683.594,7716.0,166.700,1098.7,4.54,4.3,279.295,1.92,2.62 +1999,3,10819.914,7286.4,1848.949,697.936,7765.9,168.100,1102.3,4.75,4.2,280.203,3.35,1.41 +1999,4,11014.254,7389.2,1914.567,713.445,7887.7,169.300,1121.9,5.20,4.1,280.976,2.85,2.35 +2000,1,11043.044,7501.3,1887.836,685.216,8053.4,170.900,1113.5,5.63,4,281.653,3.76,1.87 +2000,2,11258.454,7571.8,2018.529,712.641,8135.9,172.700,1103,5.81,3.9,282.385,4.19,1.62 +2000,3,11267.867,7645.9,1986.956,698.827,8222.3,173.900,1098.7,6.07,4,283.190,2.77,3.3 +2000,4,11334.544,7713.5,1987.845,695.597,8234.6,175.600,1097.7,5.70,3.9,283.900,3.89,1.81 +2001,1,11297.171,7744.3,1882.691,710.403,8296.5,176.400,1114.9,4.39,4.2,284.550,1.82,2.57 +2001,2,11371.251,7773.5,1876.650,725.623,8273.7,177.400,1139.7,3.54,4.4,285.267,2.26,1.28 +2001,3,11340.075,7807.7,1837.074,730.493,8484.5,177.600,1166,2.72,4.8,286.047,0.45,2.27 +2001,4,11380.128,7930.0,1731.189,739.318,8385.5,177.700,1190.9,1.74,5.5,286.728,0.23,1.51 +2002,1,11477.868,7957.3,1789.327,756.915,8611.6,179.300,1185.9,1.75,5.7,287.328,3.59,-1.84 +2002,2,11538.770,7997.8,1810.779,774.408,8658.9,180.000,1199.5,1.70,5.8,288.028,1.56,0.14 +2002,3,11596.430,8052.0,1814.531,786.673,8629.2,181.200,1204,1.61,5.7,288.783,2.66,-1.05 +2002,4,11598.824,8080.6,1813.219,799.967,8649.6,182.600,1226.8,1.20,5.8,289.421,3.08,-1.88 +2003,1,11645.819,8122.3,1813.141,800.196,8681.3,183.200,1248.4,1.14,5.9,290.019,1.31,-0.17 
+2003,2,11738.706,8197.8,1823.698,838.775,8812.5,183.700,1287.9,0.96,6.2,290.704,1.09,-0.13 +2003,3,11935.461,8312.1,1889.883,839.598,8935.4,184.900,1297.3,0.94,6.1,291.449,2.6,-1.67 +2003,4,12042.817,8358.0,1959.783,845.722,8986.4,186.300,1306.1,0.90,5.8,292.057,3.02,-2.11 +2004,1,12127.623,8437.6,1970.015,856.570,9025.9,187.400,1332.1,0.94,5.7,292.635,2.35,-1.42 +2004,2,12213.818,8483.2,2055.580,861.440,9115.0,189.100,1340.5,1.21,5.6,293.310,3.61,-2.41 +2004,3,12303.533,8555.8,2082.231,876.385,9175.9,190.800,1361,1.63,5.4,294.066,3.58,-1.95 +2004,4,12410.282,8654.2,2125.152,865.596,9303.4,191.800,1366.6,2.20,5.4,294.741,2.09,0.11 +2005,1,12534.113,8719.0,2170.299,869.204,9189.6,193.800,1357.8,2.69,5.3,295.308,4.15,-1.46 +2005,2,12587.535,8802.9,2131.468,870.044,9253.0,194.700,1366.6,3.01,5.1,295.994,1.85,1.16 +2005,3,12683.153,8865.6,2154.949,890.394,9308.0,199.200,1375,3.52,5,296.770,9.14,-5.62 +2005,4,12748.699,8888.5,2232.193,875.557,9358.7,199.400,1380.6,4.00,4.9,297.435,0.4,3.6 +2006,1,12915.938,8986.6,2264.721,900.511,9533.8,200.700,1380.5,4.51,4.7,298.061,2.6,1.91 +2006,2,12962.462,9035.0,2261.247,892.839,9617.3,202.700,1369.2,4.82,4.7,298.766,3.97,0.85 +2006,3,12965.916,9090.7,2229.636,892.002,9662.5,201.900,1369.4,4.90,4.7,299.593,-1.58,6.48 +2006,4,13060.679,9181.6,2165.966,894.404,9788.8,203.574,1373.6,4.92,4.4,300.320,3.3,1.62 +2007,1,13099.901,9265.1,2132.609,882.766,9830.2,205.920,1379.7,4.95,4.5,300.977,4.58,0.36 +2007,2,13203.977,9291.5,2162.214,898.713,9842.7,207.338,1370,4.72,4.5,301.714,2.75,1.97 +2007,3,13321.109,9335.6,2166.491,918.983,9883.9,209.133,1379.2,4.00,4.7,302.509,3.45,0.55 +2007,4,13391.249,9363.6,2123.426,925.110,9886.2,212.495,1377.4,3.01,4.8,303.204,6.38,-3.37 +2008,1,13366.865,9349.6,2082.886,943.372,9826.8,213.997,1384,1.56,4.9,303.803,2.82,-1.26 +2008,2,13415.266,9351.0,2026.518,961.280,10059.0,218.610,1409.3,1.74,5.4,304.483,8.53,-6.79 +2008,3,13324.600,9267.7,1990.693,991.551,9838.3,216.889,1474.7,1.17,6,305.270,-3.16,4.33 
+2008,4,13141.920,9195.3,1857.661,1007.273,9920.4,212.174,1576.5,0.12,6.9,305.952,-8.79,8.91 +2009,1,12925.410,9209.2,1558.494,996.287,9926.4,212.671,1592.8,0.22,8.1,306.547,0.94,-0.71 +2009,2,12901.504,9189.0,1456.678,1023.528,10077.5,214.469,1653.6,0.18,9.2,307.226,3.37,-3.19 +2009,3,12990.341,9256.0,1486.398,1044.088,10040.6,216.385,1673.9,0.12,9.6,308.013,3.56,-3.44 diff --git a/pandas/io/tests/data/stata3.dta b/pandas/io/tests/data/stata3.dta new file mode 100644 index 0000000000000..265fbcc3a8187 Binary files /dev/null and b/pandas/io/tests/data/stata3.dta differ diff --git a/pandas/io/tests/data/stata4.dta b/pandas/io/tests/data/stata4.dta new file mode 100644 index 0000000000000..c5d7de8b42295 Binary files /dev/null and b/pandas/io/tests/data/stata4.dta differ diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py new file mode 100644 index 0000000000000..79cec2870d687 --- /dev/null +++ b/pandas/io/tests/test_stata.py @@ -0,0 +1,193 @@ +# pylint: disable=E1101 + +from datetime import datetime +import os +import unittest + +import warnings +import nose + +import numpy as np + +from pandas.core.frame import DataFrame +from pandas.io.parsers import read_csv +from pandas.io.stata import read_stata, StataReader, StataWriter +import pandas.util.testing as tm + + +def curpath(): + pth, _ = os.path.split(os.path.abspath(__file__)) + return pth + + +class StataTests(unittest.TestCase): + + def setUp(self): + # Unit test datasets for dta7 - dta9 (old stata formats 104, 105 and 107) can be downloaded from: + # http://stata-press.com/data/glmext.html + self.dirpath = tm.get_data_path() + self.dta1 = os.path.join(self.dirpath, 'stata1.dta') + self.dta2 = os.path.join(self.dirpath, 'stata2.dta') + self.dta3 = os.path.join(self.dirpath, 'stata3.dta') + self.csv3 = os.path.join(self.dirpath, 'stata3.csv') + self.dta4 = os.path.join(self.dirpath, 'stata4.dta') + self.dta5 = os.path.join(self.dirpath, 'stata5.dta') + self.dta6 = os.path.join(self.dirpath, 
'stata6.dta') + self.dta7 = os.path.join(self.dirpath, 'cancer.dta') + self.csv7 = os.path.join(self.dirpath, 'cancer.csv') + self.dta8 = os.path.join(self.dirpath, 'tbl19-3.dta') + self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv') + self.dta9 = os.path.join(self.dirpath, 'lbw.dta') + self.csv9 = os.path.join(self.dirpath, 'lbw.csv') + + def read_dta(self, file): + return read_stata(file, convert_dates=True) + + def read_csv(self, file): + return read_csv(file, parse_dates=True) + + def test_read_dta1(self): + reader = StataReader(self.dta1) + parsed = reader.data() + # Pandas uses np.nan as missing value. Thus, all columns will be of type float, regardless of their name. + expected = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=['float_miss', 'double_miss', 'byte_miss', 'int_miss', 'long_miss']) + + for i, col in enumerate(parsed.columns): + np.testing.assert_almost_equal( + parsed[col], + expected[expected.columns[i]] + ) + + def test_read_dta2(self): + expected = DataFrame.from_records( + [ + ( + datetime(2006, 11, 19, 23, 13, 20), + 1479596223000, + datetime(2010, 1, 20), + datetime(2010, 1, 8), + datetime(2010, 1, 1), + datetime(1974, 7, 1), + datetime(2010, 1, 1), + datetime(2010, 1, 1) + ), + ( + datetime(1959, 12, 31, 20, 3, 20), + -1479590, + datetime(1953, 10, 2), + datetime(1948, 6, 10), + datetime(1955, 1, 1), + datetime(1955, 7, 1), + datetime(1955, 1, 1), + datetime(2, 1, 1) + ), + ( + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('NaT'), + np.datetime64('NaT') + ) + ], + columns=['datetime_c', 'datetime_big_c', 'date', 'weekly_date', 'monthly_date', 'quarterly_date', 'half_yearly_date', 'yearly_date'] + ) + + with warnings.catch_warnings(record=True) as w: + parsed = self.read_dta(self.dta2) + np.testing.assert_equal( + len(w), 1) # should get a warning for that format. 
+ + tm.assert_frame_equal(parsed, expected) + + def test_read_dta3(self): + parsed = self.read_dta(self.dta3) + expected = self.read_csv(self.csv3) + for i, col in enumerate(parsed.columns): + np.testing.assert_almost_equal( + parsed[col], + expected[expected.columns[i]], + decimal=3 + ) + + def test_read_dta4(self): + parsed = self.read_dta(self.dta4) + expected = DataFrame.from_records( + [ + ["one", "ten", "one", "one", "one"], + ["two", "nine", "two", "two", "two"], + ["three", "eight", "three", "three", "three"], + ["four", "seven", 4, "four", "four"], + ["five", "six", 5, np.nan, "five"], + ["six", "five", 6, np.nan, "six"], + ["seven", "four", 7, np.nan, "seven"], + ["eight", "three", 8, np.nan, "eight"], + ["nine", "two", 9, np.nan, "nine"], + ["ten", "one", "ten", np.nan, "ten"] + ], + columns=['fully_labeled', 'fully_labeled2', 'incompletely_labeled', 'labeled_with_missings', 'float_labelled']) + + tm.assert_frame_equal(parsed, expected) + + def test_write_dta5(self): + original = DataFrame([(np.nan, np.nan, np.nan, np.nan, np.nan)], + columns=['float_miss', 'double_miss', 'byte_miss', 'int_miss', 'long_miss']) + + writer = StataWriter(self.dta5, original, None, False) + writer.write_file() + + written_and_read_again = self.read_dta(self.dta5) + tm.assert_frame_equal(written_and_read_again, original) + + def test_write_dta6(self): + original = self.read_csv(self.csv3) + + writer = StataWriter(self.dta6, original, None, False) + writer.write_file() + + written_and_read_again = self.read_dta(self.dta6) + tm.assert_frame_equal(written_and_read_again, original) + + @nose.tools.nottest + def test_read_dta7(self): + expected = read_csv(self.csv7, parse_dates=True, sep='\t') + parsed = self.read_dta(self.dta7) + + for i, col in enumerate(parsed.columns): + np.testing.assert_almost_equal( + parsed[col], + expected[expected.columns[i]], + decimal=3 + ) + + @nose.tools.nottest + def test_read_dta8(self): + expected = read_csv(self.csv8, parse_dates=True, sep='\t') 
+ parsed = self.read_dta(self.dta8) + + for i, col in enumerate(parsed.columns): + np.testing.assert_almost_equal( + parsed[col], + expected[expected.columns[i]], + decimal=3 + ) + + @nose.tools.nottest + def test_read_dta9(self): + expected = read_csv(self.csv9, parse_dates=True, sep='\t') + parsed = self.read_dta(self.dta9) + + for i, col in enumerate(parsed.columns): + np.testing.assert_equal( + parsed[col], + expected[expected.columns[i]], + decimal=3 + ) + + +if __name__ == '__main__': + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], + exit=False) diff --git a/setup.py b/setup.py index 3e56144e25378..d1b1382e26dbc 100755 --- a/setup.py +++ b/setup.py @@ -506,6 +506,7 @@ def pxd(name): 'tests/data/legacy_pickle/0.10.1/*.pickle', 'tests/data/legacy_pickle/0.11.0/*.pickle', 'tests/data/*.csv', + 'tests/data/*.dta', 'tests/data/*.txt', 'tests/data/*.xls', 'tests/data/*.xlsx',