Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parallel coordinates plot #1488

Merged
merged 3 commits into from
Jul 11, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions doc/source/visualization.rst
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,27 @@ of the same class will usually be closer together and form larger structures.
@savefig andrews_curves.png width=6in
andrews_curves(data, 'Name')

Parallel Coordinates
~~~~~~~~~~~~~~~~~~~~

Parallel coordinates is a plotting technique for plotting multivariate data.
It allows one to see clusters in data and to estimate other statistics visually.
Using parallel coordinates, points are represented as connected line segments.
Each vertical line represents one attribute. One set of connected line segments
represents one data point. Points that tend to cluster will appear closer together.

.. ipython:: python

from pandas import read_csv
from pandas.tools.plotting import parallel_coordinates

data = read_csv('data/iris.data')

plt.figure()

@savefig parallel_coordinates.png width=6in
parallel_coordinates(data, 'Name')

Lag Plot
~~~~~~~~

Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/test_graphics.py
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,14 @@ def test_andrews_curves(self):
df = read_csv(path)
_check_plot_works(andrews_curves, df, 'Name')

@slow
def test_parallel_coordinates(self):
    # Smoke test: load the iris CSV that sits next to this test module and
    # hand it to the shared plot checker, using 'Name' as the class column.
    from pandas.tools.plotting import parallel_coordinates
    from pandas import read_csv
    iris_path = os.path.join(curpath(), 'data/iris.csv')
    iris = read_csv(iris_path)
    _check_plot_works(parallel_coordinates, iris, 'Name')

@slow
def test_plot_int_columns(self):
df = DataFrame(np.random.randn(100, 4)).cumsum()
Expand Down
47 changes: 47 additions & 0 deletions pandas/tools/plotting.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,53 @@ def random_color(column):
ax.grid()
return ax

def parallel_coordinates(data, class_column, cols=None, ax=None, **kwds):
    """Parallel coordinates plotting.

    Each row of ``data`` is drawn as one polyline whose vertices sit on
    evenly spaced vertical axes, one axis per plotted column.  Rows that
    share a value in ``class_column`` share a colour, and each class gets
    a single legend entry (attached to the first row of that class).

    Parameters:
    -----------
    data: A DataFrame containing data to be plotted
    class_column: Column name containing class names
    cols: A list of column names to use, optional.  Defaults to every
        column of ``data`` except ``class_column``.
    ax: matplotlib axis object, optional.  Defaults to the current axes
        (``plt.gca()``); matplotlib is only imported in that case.
    kwds: A list of keywords for matplotlib plot method

    Returns:
    --------
    ax: matplotlib axis object
    """
    import random

    def random_color(column):
        # A private generator gives the same colour for a given class on
        # every call without reseeding the module-level ``random`` state.
        try:
            rng = random.Random(column)
        except TypeError:
            # Newer Pythons accept only a few builtin types as seeds (NumPy
            # integers are rejected), so fall back to the text form.
            rng = random.Random(str(column))
        return [rng.random() for _ in range(3)]

    # ``is None`` rather than ``== None``: an Index/ndarray passed as
    # ``cols`` would compare elementwise and fail the truth test.
    if cols is None:
        col_names = [col for col in data.columns if col != class_column]
    else:
        col_names = list(cols)

    # Work on the raw arrays so rows are addressed by position; indexing
    # the Series with ``i`` would be a label lookup and break on any index
    # other than 0..n-1.
    class_values = data[class_column].values
    columns = [data[col].values for col in col_names]

    if ax is None:
        import matplotlib.pyplot as plt
        ax = plt.gca()

    x = list(range(len(col_names)))
    colors = {}  # class value -> RGB triple, computed once per class
    used_legends = set()
    for i in range(len(data)):
        klass = class_values[i]
        if klass not in colors:
            colors[klass] = random_color(klass)
        y = [column[i] for column in columns]
        # Only the first line of each class carries a label, so the legend
        # lists every class exactly once.
        label = None
        name = str(klass)
        if name not in used_legends:
            label = name
            used_legends.add(name)
        ax.plot(x, y, color=colors[klass], label=label, **kwds)

    # One vertical rule per attribute axis.
    for position in x:
        ax.axvline(position, linewidth=1, color='black')

    ax.set_xticks(x)
    # Label ticks with the columns actually plotted (honours ``cols``).
    ax.set_xticklabels(col_names)
    ax.legend(loc='upper right')
    ax.grid()
    return ax

def lag_plot(series, ax=None, **kwds):
"""Lag plot for time series.

Expand Down