Skip to content

Commit

Permalink
ENH: extensively refactor BlockJoinOperation to support n > 2, Concatenator class to orchestrate concatenations, #273, #479
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jan 5, 2012
1 parent eef27e6 commit 3b1c5b7
Show file tree
Hide file tree
Showing 11 changed files with 547 additions and 413 deletions.
1 change: 1 addition & 0 deletions RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ pandas 0.7.0
5-10x in most typical use cases (GH #374)
- Some performance enhancements in constructing a Panel from a dict of
DataFrame objects
- Made ``Index._get_duplicates`` a public method by removing the underscore

**Bug fixes**

Expand Down
119 changes: 17 additions & 102 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
import numpy.ma as ma

from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
_default_index, _stringify, _maybe_upcast)
_default_index, _stringify)
from pandas.core.daterange import DateRange
from pandas.core.generic import NDFrame
from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index
Expand Down Expand Up @@ -1638,7 +1638,8 @@ def reindex_like(self, other, method=None, copy=True):

truncate = generic.truncate

def set_index(self, col_or_cols, drop=True, inplace=False):
def set_index(self, col_or_cols, drop=True, inplace=False,
verify_integrity=True):
"""
Set the DataFrame index (row labels) using one or more existing
columns. By default yields a new object.
Expand All @@ -1650,6 +1651,10 @@ def set_index(self, col_or_cols, drop=True, inplace=False):
Delete columns to be used as the new index
inplace : boolean, default False
Modify the DataFrame in place (do not create a new object)
verify_integrity : boolean, default True
Check the new index for duplicates. Otherwise defer the check until
necessary. Setting to False will improve the performance of this
method
Returns
-------
Expand All @@ -1674,8 +1679,8 @@ def set_index(self, col_or_cols, drop=True, inplace=False):

index = MultiIndex.from_arrays(arrays, names=cols)

if not index._verify_integrity():
duplicates = index._get_duplicates()
if verify_integrity and not index._verify_integrity():
duplicates = index.get_duplicates()
raise Exception('Index has duplicate keys: %s' % duplicates)

# clear up memory usage
Expand Down Expand Up @@ -2738,60 +2743,13 @@ def append(self, other, ignore_index=False):
if not self:
return other.copy()

if ignore_index:
new_index = None
from pandas.tools.merge import concat
if isinstance(other, list):
to_concat = [self] + other
else:
new_index = self.index.append(other.index)
assert(new_index._verify_integrity())

if self.columns.equals(other.columns):
return self._append_same_columns(other, new_index)
else:
return self._append_different_columns(other, new_index)

def _append_different_columns(self, other, new_index):
    # Row-wise append when `other` has a different column set from self.
    indexer = self.columns.get_indexer(other.columns)

    if not (indexer == -1).any():
        # every column of `other` already exists in self: keep our columns
        new_columns = self.columns
    else:
        new_columns = self.columns.union(other.columns)

    # stitch the result together column by column (NaN-padding mismatches)
    new_data = self._append_column_by_column(other)
    return self._constructor(data=new_data, index=new_index,
                             columns=new_columns)

def _append_same_columns(self, other, new_index):
    # Row-wise append when both frames share an identical column set.
    if self._is_mixed_type:
        # heterogeneous dtypes: must assemble column by column
        new_data = self._append_column_by_column(other)
    else:
        # homogeneous dtype: a single ndarray concatenation suffices
        new_data = np.concatenate((self.values, other.values), axis=0)
    return self._constructor(new_data, index=new_index,
                             columns=self.columns)

def _append_column_by_column(self, other):
    # Build the appended data as a dict of column -> concatenated ndarray,
    # NaN-padding any column missing from either frame.
    def _concat_missing(values, n):
        # upcast so NaN is representable in the dtype, then produce an
        # all-NaN filler block of length n
        values = _maybe_upcast(values)
        missing_values = np.empty(n, dtype=values.dtype)
        missing_values.fill(np.nan)
        return values, missing_values

    new_data = {}
    for col in self:
        values = self._get_raw_column(col)
        if col in other:
            other_values = other._get_raw_column(col)
        else:
            # column absent from `other`: pad its rows with NaN
            values, other_values = _concat_missing(values, len(other))
        new_data[col] = np.concatenate((values, other_values))

    for col in other:
        values = other._get_raw_column(col)
        if col not in self:
            # column only in `other`: NaN-fill self's rows, then append
            values, missing_values = _concat_missing(values, len(self))
            new_data[col] = np.concatenate((missing_values, values))

    return new_data
to_concat = [self, other]
return concat(to_concat, ignore_index=ignore_index,
verify_integrity=True)

def _get_raw_column(self, col):
    # Fetch the raw values for a single column from the block manager.
    return self._data.get(col)
Expand Down Expand Up @@ -3618,6 +3576,8 @@ def factor_agg(factor, vec, func):


def extract_index(data):
from pandas.core.index import _union_indexes

index = None
if len(data) == 0:
index = NULL_INDEX
Expand Down Expand Up @@ -3663,51 +3623,6 @@ def extract_index(data):
return _ensure_index(index)


def _union_indexes(indexes):
    # Union a sequence of Index objects / raw label lists into one Index.
    if len(indexes) == 0:
        return Index([])

    if len(indexes) == 1:
        result = indexes[0]
        if isinstance(result, list):
            # a bare list of labels is promoted to a sorted Index
            result = Index(sorted(result))
        return result

    indexes, kind = _sanitize_and_check(indexes)

    if kind == 'special':
        # mixed / subclassed indexes: fall back to pairwise union
        result = indexes[0]
        for other in indexes[1:]:
            result = result.union(other)
        return result
    elif kind == 'array':
        index = indexes[0]
        for other in indexes[1:]:
            if not index.equals(other):
                return Index(lib.fast_unique_multiple(indexes))

        # all indexes equal: reuse the first as-is
        return index
    else:
        # plain label lists only
        return Index(lib.fast_unique_multiple_list(indexes))


def _sanitize_and_check(indexes):
    # Classify inputs as 'list' (all raw label lists), 'array' (all plain
    # Index objects) or 'special' (mixed / Index subclasses), converting
    # raw lists to Index when they are mixed with real indexes.
    kinds = list(set([type(index) for index in indexes]))

    if list in kinds:
        if len(kinds) > 1:
            indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x
                       for x in indexes]
            kinds.remove(list)
        else:
            return indexes, 'list'

    if len(kinds) > 1 or Index not in kinds:
        return indexes, 'special'
    else:
        return indexes, 'array'


def _check_data_types(data):
have_raw_arrays = False
Expand Down
84 changes: 73 additions & 11 deletions pandas/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import numpy as np

from pandas.core.common import (adjoin as _adjoin, _stringify,
from pandas.core.common import (adjoin as _adjoin, _stringify, _try_sort,
_is_bool_indexer, _asarray_tuplesafe)
from pandas.util.decorators import cache_readonly
import pandas.core.common as com
Expand Down Expand Up @@ -119,6 +119,15 @@ def is_monotonic(self):
except TypeError:
return False

def get_duplicates(self):
    """
    Return a sorted list of index values that appear more than once.

    Returns
    -------
    duplicates : list
    """
    from collections import Counter
    # Counter replaces the hand-rolled defaultdict tally, and .items()
    # (unlike the former .iteritems()) works on Python 2.7 and 3 alike.
    counts = Counter(self.values)
    return sorted(k for k, v in counts.items() if v > 1)

# backwards-compatibility alias for the previously-private name
_get_duplicates = get_duplicates

@property
def indexMap(self):
"{label -> location}"
Expand All @@ -143,13 +152,6 @@ def _get_level_number(self, level):
def _verify_integrity(self):
    # Delegates to the index engine; presumably True when the index has no
    # duplicate labels (callers report duplicates on failure) -- TODO confirm
    # against the engine implementation.
    return self._engine.has_integrity

def _get_duplicates(self):
    # Sorted list of labels occurring more than once in the index.
    from collections import defaultdict
    counter = defaultdict(lambda: 0)
    for k in self.values:
        counter[k] += 1
    return sorted(k for k, v in counter.iteritems() if v > 1)

_allDates = None
def is_all_dates(self):
"""
Expand Down Expand Up @@ -1261,9 +1263,6 @@ def append(self, other):
appended : Index
"""
if isinstance(other, (list, tuple)):
for k in other:
assert(isinstance(k, MultiIndex))

to_concat = (self.values,) + tuple(k.values for k in other)
else:
to_concat = self.values, other.values
Expand Down Expand Up @@ -1871,3 +1870,66 @@ def _ensure_index(index_like):
def _validate_join_method(method):
if method not in ['left', 'right', 'inner', 'outer']:
raise Exception('do not recognize join method %s' % method)

# TODO: handle index names!

def _get_combined_index(indexes, intersect=False):
    """Combine several indexes into one, by union (default) or intersection."""
    indexes = _get_distinct_indexes(indexes)
    if len(indexes) == 1:
        # nothing to combine
        return indexes[0]
    if not intersect:
        return Index(_union_indexes(indexes))
    result = indexes[0]
    for other_index in indexes[1:]:
        result = result.intersection(other_index)
    return result

def _get_distinct_indexes(indexes):
return dict((id(x), x) for x in indexes).values()


def _union_indexes(indexes):
if len(indexes) == 0:
return Index([])

if len(indexes) == 1:
result = indexes[0]
if isinstance(result, list):
result = Index(sorted(result))
return result

indexes, kind = _sanitize_and_check(indexes)

if kind == 'special':
result = indexes[0]
for other in indexes[1:]:
result = result.union(other)
return result
elif kind == 'array':
index = indexes[0]
for other in indexes[1:]:
if not index.equals(other):
return Index(lib.fast_unique_multiple(indexes))

return index
else:
return Index(lib.fast_unique_multiple_list(indexes))


def _sanitize_and_check(indexes):
kinds = list(set([type(index) for index in indexes]))

if list in kinds:
if len(kinds) > 1:
indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x
for x in indexes]
kinds.remove(list)
else:
return indexes, 'list'


if len(kinds) > 1 or Index not in kinds:
return indexes, 'special'
else:
return indexes, 'array'
49 changes: 8 additions & 41 deletions pandas/core/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,11 @@

from pandas.core.common import (PandasError, _mut_exclusive,
_try_sort, _default_index, _infer_dtype)
from pandas.core.index import Factor, Index, MultiIndex, _ensure_index
from pandas.core.index import (Factor, Index, MultiIndex, _ensure_index,
_get_combined_index, _union_indexes)
from pandas.core.indexing import _NDFrameIndexer
from pandas.core.internals import BlockManager, make_block, form_blocks
from pandas.core.frame import DataFrame, _union_indexes
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
from pandas.util import py3compat
from pandas.util.decorators import deprecate
Expand Down Expand Up @@ -1152,52 +1153,18 @@ def _homogenize_dict(frames, intersect=True, dtype=None):
else:
adj_frames[k] = v

index = _get_combined_index(adj_frames, intersect=intersect)
columns = _get_combined_columns(adj_frames, intersect=intersect)
all_indexes = [df.index for df in adj_frames.values()]
all_columns = [df.columns for df in adj_frames.values()]

index = _get_combined_index(all_indexes, intersect=intersect)
columns = _get_combined_index(all_columns, intersect=intersect)

for key, frame in adj_frames.iteritems():
result[key] = frame.reindex(index=index, columns=columns,
copy=False)

return result, index, columns

def _get_combined_columns(frames, intersect=False):
    # Union (or intersection) of the column sets of a dict of frames,
    # returned as a sorted Index.
    columns = None

    if intersect:
        combine = set.intersection
    else:
        combine = set.union

    for _, frame in frames.iteritems():
        this_cols = set(frame.columns)

        if columns is None:
            # the first frame seeds the running set
            columns = this_cols
        else:
            columns = combine(columns, this_cols)

    return Index(sorted(columns))

def _get_combined_index(frames, intersect=False):
    # Combined (union or intersection) row index of a dict of frames.
    from pandas.core.frame import _union_indexes

    indexes = _get_distinct_indexes([df.index for df in frames.values()])
    if len(indexes) == 1:
        # only one distinct index object: reuse it directly
        return indexes[0]
    if intersect:
        index = indexes[0]
        for other in indexes[1:]:
            index = index.intersection(other)
        return index
    union = _union_indexes(indexes)
    return Index(union)

def _get_distinct_indexes(indexes):
    # De-duplicate index objects by identity: sort by id() so groupby sees
    # equal ids adjacently, then take one representative per group.
    from itertools import groupby
    indexes = sorted(indexes, key=id)
    return [gp.next() for _, gp in groupby(indexes, id)]

def _monotonic(arr):
return not (arr[1:] < arr[:-1]).any()

Expand Down
2 changes: 1 addition & 1 deletion pandas/io/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -413,7 +413,7 @@ def get_chunk(self, rows=None):
index = Index(np.arange(len(content)))

if not index._verify_integrity():
dups = index._get_duplicates()
dups = index.get_duplicates()
raise Exception('Index has duplicates: %s' % str(dups))

if len(self.columns) != len(zipped_content):
Expand Down
8 changes: 5 additions & 3 deletions pandas/sparse/panel.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,7 +426,7 @@ def minor_xs(self, key):
SparseWidePanel = SparsePanel

def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
from pandas.core.panel import _get_combined_index, _get_combined_columns
from pandas.core.panel import _get_combined_index
output = {}
for item, df in frames.iteritems():
if not isinstance(df, SparseDataFrame):
Expand All @@ -436,9 +436,11 @@ def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
output[item] = df

if index is None:
index = _get_combined_index(output)
all_indexes = [df.index for df in output.values()]
index = _get_combined_index(all_indexes)
if columns is None:
columns = _get_combined_columns(output)
all_columns = [df.columns for df in output.values()]
columns = _get_combined_index(all_columns)

index = _ensure_index(index)
columns = _ensure_index(columns)
Expand Down
Loading

0 comments on commit 3b1c5b7

Please sign in to comment.