From 665c1c469e624db106a97c08486a43001555c036 Mon Sep 17 00:00:00 2001 From: Vytautas Jancauskas Date: Sun, 17 Jun 2012 20:55:37 +0300 Subject: [PATCH 1/3] Added parallel coordinates plotting --- pandas/tools/plotting.py | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/pandas/tools/plotting.py b/pandas/tools/plotting.py index 515434edda6b0..284c9bfb54c88 100644 --- a/pandas/tools/plotting.py +++ b/pandas/tools/plotting.py @@ -178,6 +178,53 @@ def random_color(column): ax.grid() return ax +def parallel_coordinates(data, class_column, cols=None, ax=None, **kwds): + """Parallel coordinates plotting. + + Parameters: + ----------- + data: A DataFrame containing data to be plotted + class_column: Column name containing class names + cols: A list of column names to use, optional (default: all but class_column) + ax: matplotlib axis object, optional (default: the current axis) + kwds: Keyword arguments passed on to the matplotlib plot method + + Returns: + -------- + ax: matplotlib axis object + """ + import matplotlib.pyplot as plt + import random + def random_color(column): + random.seed(column) + return [random.random() for _ in range(3)] + n = len(data) + # Raw values, so rows are addressed by position rather than by index label + class_col = data[class_column].values + if cols is None: + cols = [col for col in data.columns if col != class_column] + # An explicit cols selection drives both the plotted data and the tick labels + columns = [data[col].values for col in cols] + used_legends = set([]) + x = range(len(columns)) + if ax is None: + ax = plt.gca() + for i in range(n): + row = [col[i] for col in columns] + y = row + label = None + if str(class_col[i]) not in used_legends: + label = str(class_col[i]) + used_legends.add(label) + ax.plot(x, y, color=random_color(class_col[i]), label=label, **kwds) + for i, col in enumerate(columns): + ax.axvline(i, linewidth=1, color='black') + ax.set_xticks(range(len(columns))) + ax.set_xticklabels(list(cols)) + ax.legend(loc='upper right') + ax.grid() + return ax + def lag_plot(series, ax=None, **kwds): """Lag
plot for time series. From b74a7cee67524016333597491798964068ee286f Mon Sep 17 00:00:00 2001 From: Vytautas Jancauskas Date: Sun, 17 Jun 2012 21:02:03 +0300 Subject: [PATCH 2/3] Added tests for parallel coordinates plotting --- pandas/tests/test_graphics.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/test_graphics.py b/pandas/tests/test_graphics.py index 8326445f28cb0..8c1b512622513 100644 --- a/pandas/tests/test_graphics.py +++ b/pandas/tests/test_graphics.py @@ -271,6 +271,14 @@ def test_andrews_curves(self): df = read_csv(path) _check_plot_works(andrews_curves, df, 'Name') + @slow + def test_parallel_coordinates(self): + from pandas import read_csv + from pandas.tools.plotting import parallel_coordinates + path = os.path.join(curpath(), 'data/iris.csv') + df = read_csv(path) + _check_plot_works(parallel_coordinates, df, 'Name') + @slow def test_plot_int_columns(self): df = DataFrame(np.random.randn(100, 4)).cumsum() From f09b6a653f5362cabd527c737dcdb575e33ddb5b Mon Sep 17 00:00:00 2001 From: Vytautas Jancauskas Date: Sun, 17 Jun 2012 21:14:23 +0300 Subject: [PATCH 3/3] Added parallel coordinates documentation --- doc/source/visualization.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/source/visualization.rst b/doc/source/visualization.rst index 34bd803516468..dc89d99749d39 100644 --- a/doc/source/visualization.rst +++ b/doc/source/visualization.rst @@ -284,6 +284,27 @@ of the same class will usually be closer together and form larger structures. @savefig andrews_curves.png width=6in andrews_curves(data, 'Name') +Parallel Coordinates +~~~~~~~~~~~~~~~~~~~~ + +Parallel coordinates is a technique for plotting multivariate data. +It allows one to see clusters in data and to estimate other statistics visually. +Using parallel coordinates, points are represented as connected line segments. +Each vertical line represents one attribute. One set of connected line segments +represents one data point.
Points that tend to cluster will appear closer together. + +.. ipython:: python + + from pandas import read_csv + from pandas.tools.plotting import parallel_coordinates + + data = read_csv('data/iris.data') + + plt.figure() + + @savefig parallel_coordinates.png width=6in + parallel_coordinates(data, 'Name') + Lag Plot ~~~~~~~~