Skip to content

Commit

Permalink
Merge PR #1566
Browse files Browse the repository at this point in the history
  • Loading branch information
wesm committed Jul 11, 2012
2 parents 6195a32 + b9cf9dd commit 5ca7425
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 2 deletions.
3 changes: 2 additions & 1 deletion RELEASE.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@ pandas 0.8.1
**New features**

- Can pass dict of per-column line styles to DataFrame.plot (#1559)
- Add new ``bootstrap_plot`` function
- Add new ``bootstrap_plot`` plot function
- Add new ``parallel_coordinates`` plot function (#1488)

**Improvements to existing features**

Expand Down
28 changes: 27 additions & 1 deletion doc/source/visualization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ Bootstrap Plot

Bootstrap plots are used to visually assess the uncertainty of a statistic, such
as mean, median, midrange, etc. A random subset of a specified size is selected
from a data set, the statistic in question is computed for this subset and the
from a data set, the statistic in question is computed for this subset and the
process is repeated a specified number of times. The resulting plots and
histograms are what constitute the bootstrap plot.

Expand All @@ -380,3 +380,29 @@ are what constitutes the bootstrap plot.
@savefig bootstrap_plot.png width=8in
bootstrap_plot(data, size=50, samples=500, color='grey')
RadViz
~~~~~~

RadViz is a way of visualizing multi-variate data. It is based on a simple
spring tension minimization algorithm. Basically you set up a bunch of points in
a plane. In our case they are equally spaced on a unit circle. Each point
represents a single attribute. You then pretend that each sample in the data set
is attached to each of these points by a spring, the stiffness of which is
proportional to the numerical value of that attribute (the values are normalized
to the unit interval). The point in the plane where our sample settles (where
the forces acting on our sample are at equilibrium) is where a dot representing
our sample will be drawn. The dot is colored according to the class that the
sample belongs to.

.. ipython:: python
from pandas import read_csv
from pandas.tools.plotting import radviz
data = read_csv('data/iris.data')
plt.figure()
@savefig radviz.png width=6in
radviz(data, 'Name')
8 changes: 8 additions & 0 deletions pandas/tests/test_graphics.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,14 @@ def test_parallel_coordinates(self):
df = read_csv(path)
_check_plot_works(parallel_coordinates, df, 'Name')

@slow
def test_radviz(self):
    # Smoke test: load the iris data set that sits next to this test module
    # and check that radviz can draw it, using 'Name' as the class column.
    from pandas.tools.plotting import radviz
    from pandas import read_csv
    iris_path = os.path.join(curpath(), 'data/iris.csv')
    iris = read_csv(iris_path)
    _check_plot_works(radviz, iris, 'Name')

@slow
def test_plot_int_columns(self):
df = DataFrame(np.random.randn(100, 4)).cumsum()
Expand Down
59 changes: 59 additions & 0 deletions pandas/tools/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,65 @@ def _get_marker_compat(marker):
return 'o'
return marker

def radviz(frame, class_column, ax=None, **kwds):
    """RadViz - a multivariate data visualization algorithm

    Every column other than ``class_column`` becomes an anchor point on the
    unit circle. Each row is drawn at the weighted average of the anchors,
    the weights being the row's values after each column has been min-max
    normalized to the unit interval. Rows are colored by class.

    Parameters:
    -----------
    frame: DataFrame object
    class_column: Column name that contains information about class membership
    ax: Matplotlib axis object, optional
        When omitted, the current axes are used and their limits are set to
        [-1, 1] on both axes.
    kwds: Matplotlib scatter method keyword arguments, optional

    Returns:
    --------
    ax: Matplotlib axis object

    Notes:
    ------
    A column whose values are all equal contributes a weight of zero to
    every row, and a row whose weights sum to zero is drawn at the origin;
    both cases would otherwise produce NaN coordinates.
    """
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import random

    def random_color(label):
        # A private generator yields the same per-class color as seeding the
        # module-level generator would, without disturbing global state.
        rng = random.Random(label)
        return [rng.random() for _ in range(3)]

    def normalize(series):
        # Min-max scale to [0, 1]; work on a plain array so that the frame's
        # index labels play no role.
        values = np.asarray(series, dtype=float)
        if values.size == 0:
            return values
        low = values.min()
        span = values.max() - low
        if span == 0:
            # Constant column: avoid 0/0 and give it no pull at all.
            return np.zeros_like(values)
        return (values - low) / span

    column_names = [column_name for column_name in frame.columns
                    if column_name != class_column]

    # One row per sample, one column per attribute.
    if column_names:
        weights = np.column_stack([normalize(frame[column_name])
                                   for column_name in column_names])
    else:
        weights = np.empty((len(frame), 0))

    if ax is None:
        ax = plt.gca()
        ax.set_xlim(-1, 1)
        ax.set_ylim(-1, 1)

    # Anchor points, equally spaced on the unit circle.
    n = len(column_names)
    angles = [2.0 * np.pi * (i / float(n)) for i in range(n)]
    anchors = np.array([(np.cos(t), np.sin(t)) for t in angles]).reshape(-1, 2)

    # Weighted average of the anchors for every sample at once.
    totals = weights.sum(axis=1)
    pulled = np.dot(weights, anchors)
    points = np.zeros_like(pulled)
    has_pull = totals != 0
    points[has_pull] = pulled[has_pull] / totals[has_pull][:, np.newaxis]

    # Group row positions by class, remembering first-appearance order so
    # that the legend is deterministic.
    positions_by_class = {}
    class_order = []
    for position, label in enumerate(list(frame[class_column])):
        if label not in positions_by_class:
            positions_by_class[label] = []
            class_order.append(label)
        positions_by_class[label].append(position)

    for label in class_order:
        members = points[positions_by_class[label]]
        ax.scatter(members[:, 0], members[:, 1], color=random_color(label),
                   label=str(label), **kwds)

    ax.add_patch(patches.Circle((0.0, 0.0), radius=1.0, facecolor='none'))

    for (x, y), name in zip(anchors, column_names):
        ax.add_patch(patches.Circle((x, y), radius=0.025, facecolor='gray'))
        # Push each label away from the circle's center, by quadrant.
        if x < 0.0:
            dx, ha = -0.025, 'right'
        else:
            dx, ha = 0.025, 'left'
        if y < 0.0:
            dy, va = -0.025, 'top'
        else:
            dy, va = 0.025, 'bottom'
        ax.text(x + dx, y + dy, name, ha=ha, va=va, size='small')

    ax.legend(loc='upper right')
    ax.axis('equal')
    return ax

def andrews_curves(data, class_column, ax=None, samples=200):
"""
Parameters:
Expand Down

0 comments on commit 5ca7425

Please sign in to comment.