From c2aed578cfc642d00fd81b5b52cd9541dd4c0444 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 28 Oct 2015 18:58:39 -0700 Subject: [PATCH 1/9] Run through ch02 and ch04 --- ch02.ipynb | 2704 ++++++++++++++++++++++++++------------------------ ch04.ipynb | 2789 +++++++++++++++++++++++++++------------------------- 2 files changed, 2839 insertions(+), 2654 deletions(-) diff --git a/ch02.ipynb b/ch02.ipynb index a57f33d91..9b7e4483f 100644 --- a/ch02.ipynb +++ b/ch02.ipynb @@ -1,1309 +1,1403 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introductory examples" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1.usa.gov data from bit.ly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "open(path).readline()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import json\n", + "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'\n", + "records = [json.loads(line) for line in open(path)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "records[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "records[0]['tz']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(records[0]['tz'])" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "### Counting time zones in pure Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "time_zones = [rec['tz'] for rec in records]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "time_zones = [rec['tz'] for rec in records if 'tz' in rec]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "time_zones[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_counts(sequence):\n", + " counts = {}\n", + " for x in sequence:\n", + " if x in counts:\n", + " counts[x] += 1\n", + " else:\n", + " counts[x] = 1\n", + " return counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "def get_counts2(sequence):\n", + " counts = defaultdict(int) # values will initialize to 0\n", + " for x in sequence:\n", + " counts[x] += 1\n", + " return counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "counts = get_counts(time_zones)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "counts['America/New_York']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "len(time_zones)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def top_counts(count_dict, n=10):\n", + " value_key_pairs = 
[(count, tz) for tz, count in count_dict.items()]\n", + " value_key_pairs.sort()\n", + " return value_key_pairs[-n:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top_counts(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import Counter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "counts = Counter(time_zones)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "counts.most_common(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Counting time zones with pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import json\n", + "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'\n", + "lines = open(path).readlines()\n", + "records = [json.loads(line) for line in lines]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas import DataFrame, Series\n", + "import pandas as pd\n", + "\n", + "frame = 
DataFrame(records)\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame['tz'][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tz_counts = frame['tz'].value_counts()\n", + "tz_counts[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "clean_tz = frame['tz'].fillna('Missing')\n", + "clean_tz[clean_tz == ''] = 'Unknown'\n", + "tz_counts = clean_tz.value_counts()\n", + "tz_counts[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tz_counts[:10].plot(kind='barh', rot=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame['a'][1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame['a'][50]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame['a'][51]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "results = Series([x.split()[0] for x in frame.a.dropna()])\n", + "results[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "results.value_counts()[:8]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], 
+ "source": [ + "cframe = frame[frame.a.notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "operating_system = np.where(cframe['a'].str.contains('Windows'),\n", + " 'Windows', 'Not Windows')\n", + "operating_system[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_tz_os = cframe.groupby(['tz', operating_system])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "agg_counts = by_tz_os.size().unstack().fillna(0)\n", + "agg_counts[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Use to sort in ascending order\n", + "indexer = agg_counts.sum(1).argsort()\n", + "indexer[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "count_subset = agg_counts.take(indexer)[-10:]\n", + "count_subset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "count_subset.plot(kind='barh', stacked=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "normed_subset = count_subset.div(count_subset.sum(1), axis=0)\n", + "normed_subset.plot(kind='barh', stacked=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 
MovieLens 1M data set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import os\n", + "encoding = 'latin1'\n", + "\n", + "upath = os.path.expanduser('ch02/movielens/users.dat')\n", + "rpath = os.path.expanduser('ch02/movielens/ratings.dat')\n", + "mpath = os.path.expanduser('ch02/movielens/movies.dat')\n", + "\n", + "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", + "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n", + "mnames = ['movie_id', 'title', 'genres']\n", + "\n", + "users = pd.read_csv(upath, sep='::', header=None, names=unames, encoding=encoding)\n", + "ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames, encoding=encoding)\n", + "movies = pd.read_csv(mpath, sep='::', header=None, names=mnames, encoding=encoding)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "users[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ratings[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "movies[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ratings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.merge(pd.merge(ratings, users), movies)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mean_ratings = 
data.pivot_table('rating', index='title',\n", + " columns='gender', aggfunc='mean')\n", + "mean_ratings[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ratings_by_title = data.groupby('title').size()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ratings_by_title[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "active_titles = ratings_by_title.index[ratings_by_title >= 250]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "active_titles[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mean_ratings = mean_ratings.ix[active_titles]\n", + "mean_ratings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':\n", + " 'Seven Samurai (Shichinin no samurai) (1954)'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)\n", + "top_female_ratings[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Measuring rating disagreement" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "sorted_by_diff = mean_ratings.sort_index(by='diff')\n", + "sorted_by_diff[:15]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Reverse order of rows, take first 15 rows\n", + "sorted_by_diff[::-1][:15]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Standard deviation of rating grouped by title\n", + "rating_std_by_title = data.groupby('title')['rating'].std()\n", + "# Filter down to active_titles\n", + "rating_std_by_title = rating_std_by_title.ix[active_titles]\n", + "# Order Series by value in descending order\n", + "rating_std_by_title.order(ascending=False)[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### US Baby Names 1880-2010" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(12, 5))\n", + "np.set_printoptions(precision=4)\n", + "%pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "http://www.ssa.gov/oact/babynames/limits.html" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!head -n 10 ch02/names/yob1880.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])\n", + "names1880" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"names1880.groupby('sex').births.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# 2010 is the last available year right now\n", + "years = range(1880, 2011)\n", + "\n", + "pieces = []\n", + "columns = ['name', 'sex', 'births']\n", + "\n", + "for year in years:\n", + " path = 'ch02/names/yob%d.txt' % year\n", + " frame = pd.read_csv(path, names=columns)\n", + "\n", + " frame['year'] = year\n", + " pieces.append(frame)\n", + "\n", + "# Concatenate everything into a single DataFrame\n", + "names = pd.concat(pieces, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "total_births = names.pivot_table('births', index='year',\n", + " columns='sex', aggfunc=sum)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "total_births.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "total_births.plot(title='Total births by sex and year')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def add_prop(group):\n", + " # Integer division floors\n", + " births = group.births.astype(float)\n", + "\n", + " group['prop'] = births / births.sum()\n", + " return group\n", + "names = names.groupby(['year', 'sex']).apply(add_prop)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_top1000(group):\n", + " return group.sort_index(by='births', ascending=False)[:1000]\n", + "grouped = names.groupby(['year', 'sex'])\n", + "top1000 = grouped.apply(get_top1000)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pieces = []\n", + "for year, group in names.groupby(['year', 'sex']):\n", + " pieces.append(group.sort_index(by='births', ascending=False)[:1000])\n", + "top1000 = pd.concat(pieces, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top1000.index = np.arange(len(top1000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top1000" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Analyzing naming trends" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "boys = top1000[top1000.sex == 'M']\n", + "girls = top1000[top1000.sex == 'F']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "total_births = top1000.pivot_table('births', index='year', columns='name',\n", + " aggfunc=sum)\n", + "total_births" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]\n", + "subset.plot(subplots=True, figsize=(12, 10), grid=False,\n", + " title=\"Number of births per year\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Measuring the increase in naming diversity" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "table = top1000.pivot_table('prop', index='year',\n", + " columns='sex', aggfunc=sum)\n", + "table.plot(title='Sum of table1000.prop by year and sex',\n", + " yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = boys[boys.year == 2010]\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()\n", + "prop_cumsum[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prop_cumsum.values.searchsorted(0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = boys[boys.year == 1900]\n", + "in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()\n", + "in1900.values.searchsorted(0.5) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_quantile_count(group, q=0.5):\n", + " group = group.sort_index(by='prop', ascending=False)\n", + " return group.prop.cumsum().values.searchsorted(q) + 1\n", + "\n", + "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n", + "diversity = diversity.unstack('sex')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_quantile_count(group, q=0.5):\n", + " group = 
group.sort_index(by='prop', ascending=False)\n", + " return group.prop.cumsum().values.searchsorted(q) + 1\n", + "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n", + "diversity = diversity.unstack('sex')\n", + "diversity.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "diversity.plot(title=\"Number of popular names in top 50%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### The \"Last letter\" Revolution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# extract last letter from name column\n", + "get_last_letter = lambda x: x[-1]\n", + "last_letters = names.name.map(get_last_letter)\n", + "last_letters.name = 'last_letter'\n", + "\n", + "table = names.pivot_table('births', index=last_letters,\n", + " columns=['sex', 'year'], aggfunc=sum)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "subtable = table.reindex(columns=[1910, 1960, 2010], level='year')\n", + "subtable.head()\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "subtable.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "letter_prop = subtable / subtable.sum().astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n", + "letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')\n", + "letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',\n", + " legend=False)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.subplots_adjust(hspace=0.25)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "letter_prop = table / table.sum().astype(float)\n", + "\n", + "dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T\n", + "dny_ts.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dny_ts.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Boy names that became girl names (and vice versa)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "all_names = top1000.name.unique()\n", + "mask = np.array(['lesl' in x.lower() for x in all_names])\n", + "lesley_like = all_names[mask]\n", + "lesley_like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "filtered = top1000[top1000.name.isin(lesley_like)]\n", + "filtered.groupby('name').births.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "table = filtered.pivot_table('births', index='year',\n", + " columns='sex', aggfunc='sum')\n", + "table = table.div(table.sum(1), axis=0)\n", + "table.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"table.plot(style={'M': 'k-', 'F': 'k--'})" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:704e09c486537fba18109341ec0479cffd74de4b440d9a71f469b9b8ac474998" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Introductory examples" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "1.usa.gov data from bit.ly" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pwd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "open(path).readline()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import json\n", - "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'\n", - "records = [json.loads(line) for line in open(path)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "records[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "records[0]['tz']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(records[0]['tz'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Counting time zones in pure Python" - ] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "time_zones = [rec['tz'] for rec in records]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "time_zones = [rec['tz'] for rec in records if 'tz' in rec]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "time_zones[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_counts(sequence):\n", - " counts = {}\n", - " for x in sequence:\n", - " if x in counts:\n", - " counts[x] += 1\n", - " else:\n", - " counts[x] = 1\n", - " return counts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from collections import defaultdict\n", - "\n", - "def get_counts2(sequence):\n", - " counts = defaultdict(int) # values will initialize to 0\n", - " for x in sequence:\n", - " counts[x] += 1\n", - " return counts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "counts = get_counts(time_zones)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "counts['America/New_York']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "len(time_zones)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def top_counts(count_dict, n=10):\n", - " value_key_pairs = [(count, tz) for tz, count in count_dict.items()]\n", - " value_key_pairs.sort()\n", - " return value_key_pairs[-n:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": 
[ - "top_counts(counts)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from collections import Counter" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "counts = Counter(time_zones)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "counts.most_common(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Counting time zones with pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "plt.rc('figure', figsize=(10, 6))\n", - "np.set_printoptions(precision=4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import json\n", - "path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'\n", - "lines = open(path).readlines()\n", - "records = [json.loads(line) for line in lines]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas import DataFrame, Series\n", - "import pandas as pd\n", - "\n", - "frame = DataFrame(records)\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['tz'][:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": 
false, - "input": [ - "tz_counts = frame['tz'].value_counts()\n", - "tz_counts[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "clean_tz = frame['tz'].fillna('Missing')\n", - "clean_tz[clean_tz == ''] = 'Unknown'\n", - "tz_counts = clean_tz.value_counts()\n", - "tz_counts[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure(figsize=(10, 4))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tz_counts[:10].plot(kind='barh', rot=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['a'][1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['a'][50]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['a'][51]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results = Series([x.split()[0] for x in frame.a.dropna()])\n", - "results[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results.value_counts()[:8]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cframe = frame[frame.a.notnull()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "operating_system = np.where(cframe['a'].str.contains('Windows'),\n", - " 'Windows', 'Not Windows')\n", - "operating_system[:5]" - ], - "language": "python", - "metadata": {}, 
- "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_tz_os = cframe.groupby(['tz', operating_system])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "agg_counts = by_tz_os.size().unstack().fillna(0)\n", - "agg_counts[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Use to sort in ascending order\n", - "indexer = agg_counts.sum(1).argsort()\n", - "indexer[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "count_subset = agg_counts.take(indexer)[-10:]\n", - "count_subset" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "count_subset.plot(kind='barh', stacked=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "normed_subset = count_subset.div(count_subset.sum(1), axis=0)\n", - "normed_subset.plot(kind='barh', stacked=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "MovieLens 1M data set" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas as pd\n", - "import os\n", - "encoding = 'latin1'\n", - "\n", - "upath = os.path.expanduser('ch02/movielens/users.dat')\n", - "rpath = os.path.expanduser('ch02/movielens/ratings.dat')\n", - "mpath = 
os.path.expanduser('ch02/movielens/movies.dat')\n", - "\n", - "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", - "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n", - "mnames = ['movie_id', 'title', 'genres']\n", - "\n", - "users = pd.read_csv(upath, sep='::', header=None, names=unames, encoding=encoding)\n", - "ratings = pd.read_csv(rpath, sep='::', header=None, names=rnames, encoding=encoding)\n", - "movies = pd.read_csv(mpath, sep='::', header=None, names=mnames, encoding=encoding)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "users[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ratings[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "movies[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ratings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.merge(pd.merge(ratings, users), movies)\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.ix[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings = data.pivot_table('rating', index='title',\n", - " columns='gender', aggfunc='mean')\n", - "mean_ratings[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ratings_by_title = data.groupby('title').size()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": 
[ - "ratings_by_title[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": true, - "input": [ - "active_titles = ratings_by_title.index[ratings_by_title >= 250]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "active_titles[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings = mean_ratings.ix[active_titles]\n", - "mean_ratings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':\n", - " 'Seven Samurai (Shichinin no samurai) (1954)'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "top_female_ratings = mean_ratings.sort_index(by='F', ascending=False)\n", - "top_female_ratings[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Measuring rating disagreement" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sorted_by_diff = mean_ratings.sort_index(by='diff')\n", - "sorted_by_diff[:15]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Reverse order of rows, take first 15 rows\n", - "sorted_by_diff[::-1][:15]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Standard 
deviation of rating grouped by title\n", - "rating_std_by_title = data.groupby('title')['rating'].std()\n", - "# Filter down to active_titles\n", - "rating_std_by_title = rating_std_by_title.ix[active_titles]\n", - "# Order Series by value in descending order\n", - "rating_std_by_title.order(ascending=False)[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### US Baby Names 1880-2010" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "plt.rc('figure', figsize=(12, 5))\n", - "np.set_printoptions(precision=4)\n", - "%pwd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "http://www.ssa.gov/oact/babynames/limits.html" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!head -n 10 ch02/names/yob1880.txt" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas as pd\n", - "names1880 = pd.read_csv('ch02/names/yob1880.txt', names=['name', 'sex', 'births'])\n", - "names1880" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names1880.groupby('sex').births.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 2010 is the last available year right now\n", - "years = range(1880, 2011)\n", - "\n", - "pieces = []\n", - "columns = ['name', 'sex', 'births']\n", - "\n", - "for year in years:\n", - " path = 'names/names/yob%d.txt' % year\n", - " frame = pd.read_csv(path, names=columns)\n", - "\n", - " frame['year'] = year\n", - " pieces.append(frame)\n", - "\n", - "# 
Concatenate everything into a single DataFrame\n", - "names = pd.concat(pieces, ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births = names.pivot_table('births', index='year',\n", - " columns='sex', aggfunc=sum)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births.tail()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births.plot(title='Total births by sex and year')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def add_prop(group):\n", - " # Integer division floors\n", - " births = group.births.astype(float)\n", - "\n", - " group['prop'] = births / births.sum()\n", - " return group\n", - "names = names.groupby(['year', 'sex']).apply(add_prop)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.allclose(names.groupby(['year', 'sex']).prop.sum(), 1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_top1000(group):\n", - " return group.sort_index(by='births', ascending=False)[:1000]\n", - "grouped = names.groupby(['year', 'sex'])\n", - "top1000 = grouped.apply(get_top1000)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pieces = []\n", - "for year, group in names.groupby(['year', 'sex']):\n", - " pieces.append(group.sort_index(by='births', ascending=False)[:1000])\n", - "top1000 = 
pd.concat(pieces, ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "top1000.index = np.arange(len(top1000))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "top1000" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Analyzing naming trends" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "boys = top1000[top1000.sex == 'M']\n", - "girls = top1000[top1000.sex == 'F']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births = top1000.pivot_table('births', index='year', columns='name',\n", - " aggfunc=sum)\n", - "total_births" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]\n", - "subset.plot(subplots=True, figsize=(12, 10), grid=False,\n", - " title=\"Number of births per year\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Measuring the increase in naming diversity" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "table = top1000.pivot_table('prop', index='year',\n", - " columns='sex', aggfunc=sum)\n", - "table.plot(title='Sum of table1000.prop by year and sex',\n", - " yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"df = boys[boys.year == 2010]\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "prop_cumsum = df.sort_index(by='prop', ascending=False).prop.cumsum()\n", - "prop_cumsum[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "prop_cumsum.values.searchsorted(0.5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = boys[boys.year == 1900]\n", - "in1900 = df.sort_index(by='prop', ascending=False).prop.cumsum()\n", - "in1900.values.searchsorted(0.5) + 1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_quantile_count(group, q=0.5):\n", - " group = group.sort_index(by='prop', ascending=False)\n", - " return group.prop.cumsum().values.searchsorted(q) + 1\n", - "\n", - "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n", - "diversity = diversity.unstack('sex')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_quantile_count(group, q=0.5):\n", - " group = group.sort_index(by='prop', ascending=False)\n", - " return group.prop.cumsum().values.searchsorted(q) + 1\n", - "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n", - "diversity = diversity.unstack('sex')\n", - "diversity.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "diversity.plot(title=\"Number of popular names in top 50%\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "The \"Last letter\" Revolution" - ] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "# extract last letter from name column\n", - "get_last_letter = lambda x: x[-1]\n", - "last_letters = names.name.map(get_last_letter)\n", - "last_letters.name = 'last_letter'\n", - "\n", - "table = names.pivot_table('births', index=last_letters,\n", - " columns=['sex', 'year'], aggfunc=sum)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "subtable = table.reindex(columns=[1910, 1960, 2010], level='year')\n", - "subtable.head()\n" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "subtable.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "letter_prop = subtable / subtable.sum().astype(float)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt\n", - "\n", - "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n", - "letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')\n", - "letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',\n", - " legend=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.subplots_adjust(hspace=0.25)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "letter_prop = table / table.sum().astype(float)\n", - "\n", - "dny_ts = letter_prop.ix[['d', 'n', 'y'], 'M'].T\n", - "dny_ts.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "dny_ts.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Boy names that became girl names (and vice versa)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "all_names = top1000.name.unique()\n", - "mask = np.array(['lesl' in x.lower() for x in all_names])\n", - "lesley_like = all_names[mask]\n", - "lesley_like" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "filtered = top1000[top1000.name.isin(lesley_like)]\n", - "filtered.groupby('name').births.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "table = filtered.pivot_table('births', index='year',\n", - " columns='sex', aggfunc='sum')\n", - "table = table.div(table.sum(1), axis=0)\n", - "table.tail()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "table.plot(style={'M': 'k-', 'F': 'k--'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/ch04.ipynb b/ch04.ipynb index 778fa44ef..f13f5c4b2 100644 --- a/ch04.ipynb +++ b/ch04.ipynb @@ -1,1353 +1,1444 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ 
+ "# NumPy Basics: Arrays and Vectorized Computation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The NumPy ndarray: a multidimensional array object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = randn(2, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data\n", + "data * 10\n", + "data + data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.shape\n", + "data.dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating ndarrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data1 = [6, 7.5, 8, 0, 1]\n", + "arr1 = np.array(data1)\n", + "arr1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]\n", + "arr2 = np.array(data2)\n", + "arr2\n", + "arr2.ndim\n", + "arr2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr1.dtype\n", + "arr2.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + 
"source": [ + "np.zeros(10)\n", + "np.zeros((3, 6))\n", + "np.empty((2, 3, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.arange(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Types for ndarrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr1 = np.array([1, 2, 3], dtype=np.float64)\n", + "arr2 = np.array([1, 2, 3], dtype=np.int32)\n", + "arr1.dtype\n", + "arr2.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.array([1, 2, 3, 4, 5])\n", + "arr.dtype\n", + "float_arr = arr.astype(np.float64)\n", + "float_arr.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])\n", + "arr\n", + "arr.astype(np.int32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)\n", + "numeric_strings.astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "int_array = np.arange(10)\n", + "calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)\n", + "int_array.astype(calibers.dtype)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "empty_uint32 = np.empty(8, dtype='u4')\n", + "empty_uint32" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operations between arrays and scalars" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.array([[1., 2., 3.], [4., 5., 6.]])\n", + "arr\n", + "arr * arr\n", + "arr - arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "1 / arr\n", + "arr ** 0.5" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Basic indexing and slicing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "arr\n", + "arr[5]\n", + "arr[5:8]\n", + "arr[5:8] = 12\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr_slice = arr[5:8]\n", + "arr_slice[1] = 12345\n", + "arr\n", + "arr_slice[:] = 64\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n", + "arr2d[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d[0][2]\n", + "arr2d[0, 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])\n", + "arr3d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr3d[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "old_values = arr3d[0].copy()\n", + "arr3d[0] = 42\n", + "arr3d\n", + "arr3d[0] = old_values\n", + "arr3d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "arr3d[1, 0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Indexing with slices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[1:6]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d\n", + "arr2d[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d[:2, 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d[1, :2]\n", + "arr2d[2, :1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d[:, :1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr2d[:2, 1:] = 0" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Boolean indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])\n", + "data = randn(7, 4)\n", + "names\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names == 'Bob'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[names == 'Bob']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[names == 'Bob', 2:]\n", + "data[names == 'Bob', 3]" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names != 'Bob'\n", + "data[-(names == 'Bob')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mask = (names == 'Bob') | (names == 'Will')\n", + "mask\n", + "data[mask]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[data < 0] = 0\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[names != 'Joe'] = 7\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fancy indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.empty((8, 4))\n", + "for i in range(8):\n", + " arr[i] = i\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[[4, 3, 0, 6]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[[-3, -5, -7]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# more on reshape in Chapter 12\n", + "arr = np.arange(32).reshape((8, 4))\n", + "arr\n", + "arr[[1, 5, 7, 2], [0, 3, 1, 2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])]" + ] + 
}, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transposing arrays and swapping axes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(15).reshape((3, 5))\n", + "arr\n", + "arr.T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.random.randn(6, 3)\n", + "np.dot(arr.T, arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(16).reshape((2, 2, 4))\n", + "arr\n", + "arr.transpose((1, 0, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr\n", + "arr.swapaxes(1, 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Universal Functions: Fast element-wise array functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.sqrt(arr)\n", + "np.exp(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "x = randn(8)\n", + "y = randn(8)\n", + "x\n", + "y\n", + "np.maximum(x, y) # element-wise maximum" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(7) * 5\n", + "np.modf(arr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data processing using arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "points = np.arange(-5, 5, 0.01) # 1000 equally spaced points\n", + "xs, ys = np.meshgrid(points, 
points)\n", + "ys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from matplotlib.pyplot import imshow, title" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "z = np.sqrt(xs ** 2 + ys ** 2)\n", + "z\n", + "plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()\n", + "plt.title(\"Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.draw()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Expressing conditional logic as array operations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])\n", + "yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])\n", + "cond = np.array([True, False, True, True, False])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = [(x if c else y)\n", + " for x, y, c in zip(xarr, yarr, cond)]\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = np.where(cond, xarr, yarr)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(4, 4)\n", + "arr\n", + "np.where(arr > 0, 2, -2)\n", + "np.where(arr > 0, 2, arr) # set only positive values to 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Not to be executed\n", + "\n", + "result = []\n", + 
"for i in range(n):\n", + " if cond1[i] and cond2[i]:\n", + " result.append(0)\n", + " elif cond1[i]:\n", + " result.append(1)\n", + " elif cond2[i]:\n", + " result.append(2)\n", + " else:\n", + " result.append(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Not to be executed\n", + "\n", + "np.where(cond1 & cond2, 0,\n", + " np.where(cond1, 1,\n", + " np.where(cond2, 2, 3)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Not to be executed\n", + "\n", + "result = 1 * cond1 + 2 * cond2 + 3 * -(cond1 | cond2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Mathematical and statistical methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.random.randn(5, 4) # normally-distributed data\n", + "arr.mean()\n", + "np.mean(arr)\n", + "arr.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr.mean(axis=1)\n", + "arr.sum(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n", + "arr.cumsum(0)\n", + "arr.cumprod(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Methods for boolean arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(100)\n", + "(arr > 0).sum() # Number of positive values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bools = np.array([False, False, True, False])\n", + 
"bools.any()\n", + "bools.all()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sorting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(8)\n", + "arr\n", + "arr.sort()\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(5, 3)\n", + "arr\n", + "arr.sort(1)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "large_arr = randn(1000)\n", + "large_arr.sort()\n", + "large_arr[int(0.05 * len(large_arr))] # 5% quantile" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unique and other set logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])\n", + "np.unique(names)\n", + "ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])\n", + "np.unique(ints)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sorted(set(names))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "values = np.array([6, 0, 0, 3, 2, 5, 6])\n", + "np.in1d(values, [2, 3, 6])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## File input and output with arrays" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Storing arrays on disk in binary format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.save('some_array', arr)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.load('some_array.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.savez('array_archive.npz', a=arr, b=arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arch = np.load('array_archive.npz')\n", + "arch['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!rm some_array.npy\n", + "!rm array_archive.npz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving and loading text files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat array_ex.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.loadtxt('array_ex.txt', delimiter=',')\n", + "arr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Linear algebra" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "x = np.array([[1., 2., 3.], [4., 5., 6.]])\n", + "y = np.array([[6., 23.], [-1, 7], [8, 9]])\n", + "x\n", + "y\n", + "x.dot(y) # equivalently np.dot(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.dot(x, np.ones(3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12345)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + 
}, + "outputs": [], + "source": [ + "from numpy.linalg import inv, qr\n", + "X = randn(5, 5)\n", + "mat = X.T.dot(X)\n", + "inv(mat)\n", + "mat.dot(inv(mat))\n", + "q, r = qr(mat)\n", + "r" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Random number generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "samples = np.random.normal(size=(4, 4))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from random import normalvariate\n", + "N = 1000000\n", + "%timeit samples = [normalvariate(0, 1) for _ in xrange(N)]\n", + "%timeit np.random.normal(size=N)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: Random Walks" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "import random\n", + "position = 0\n", + "walk = [position]\n", + "steps = 1000\n", + "for i in xrange(steps):\n", + " step = 1 if random.randint(0, 1) else -1\n", + " position += step\n", + " walk.append(position)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12345)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nsteps = 1000\n", + "draws = np.random.randint(0, 2, size=nsteps)\n", + "steps = np.where(draws > 0, 1, -1)\n", + "walk = steps.cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "walk.min()\n", + "walk.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "(np.abs(walk) >= 10).argmax()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "### Simulating many random walks at once" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nwalks = 5000\n", + "nsteps = 1000\n", + "draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1\n", + "steps = np.where(draws > 0, 1, -1)\n", + "walks = steps.cumsum(1)\n", + "walks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "walks.max()\n", + "walks.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "hits30 = (np.abs(walks) >= 30).any(1)\n", + "hits30\n", + "hits30.sum() # Number that hit 30 or -30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)\n", + "crossing_times.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "steps = np.random.normal(loc=0, scale=0.25,\n", + " size=(nwalks, nsteps))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], "metadata": { - "name": "", - "signature": "sha256:a144a882df0b168305dd22b779ee9f996375da0b2979c0c26a19c787c55767b2" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "NumPy Basics: Arrays and Vectorized Computation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from 
numpy.random import randn\n", - "import numpy as np\n", - "np.set_printoptions(precision=4, suppress=True)\n", - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "The NumPy ndarray: a multidimensional array object" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = randn(2, 3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data\n", - "data * 10\n", - "data + data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.shape\n", - "data.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Creating ndarrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data1 = [6, 7.5, 8, 0, 1]\n", - "arr1 = np.array(data1)\n", - "arr1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]\n", - "arr2 = np.array(data2)\n", - "arr2\n", - "arr2.ndim\n", - "arr2.shape" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr1.dtype\n", - "arr2.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.zeros(10)\n", - "np.zeros((3, 6))\n", - "np.empty((2, 3, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.arange(15)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Data Types for 
ndarrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr1 = np.array([1, 2, 3], dtype=np.float64)\n", - "arr2 = np.array([1, 2, 3], dtype=np.int32)\n", - "arr1.dtype\n", - "arr2.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([1, 2, 3, 4, 5])\n", - "arr.dtype\n", - "float_arr = arr.astype(np.float64)\n", - "float_arr.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])\n", - "arr\n", - "arr.astype(np.int32)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)\n", - "numeric_strings.astype(float)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "int_array = np.arange(10)\n", - "calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)\n", - "int_array.astype(calibers.dtype)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "empty_uint32 = np.empty(8, dtype='u4')\n", - "empty_uint32" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Operations between arrays and scalars" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([[1., 2., 3.], [4., 5., 6.]])\n", - "arr\n", - "arr * arr\n", - "arr - arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "1 / arr\n", - "arr ** 0.5" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"heading", - "level": 3, - "metadata": {}, - "source": [ - "Basic indexing and slicing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)\n", - "arr\n", - "arr[5]\n", - "arr[5:8]\n", - "arr[5:8] = 12\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_slice = arr[5:8]\n", - "arr_slice[1] = 12345\n", - "arr\n", - "arr_slice[:] = 64\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n", - "arr2d[2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[0][2]\n", - "arr2d[0, 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])\n", - "arr3d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr3d[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "old_values = arr3d[0].copy()\n", - "arr3d[0] = 42\n", - "arr3d\n", - "arr3d[0] = old_values\n", - "arr3d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr3d[1, 0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Indexing with slices" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[1:6]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d\n", - 
"arr2d[:2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:2, 1:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[1, :2]\n", - "arr2d[2, :1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:, :1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:2, 1:] = 0" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Boolean indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])\n", - "data = randn(7, 4)\n", - "names\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names == 'Bob'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[names == 'Bob']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[names == 'Bob', 2:]\n", - "data[names == 'Bob', 3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names != 'Bob'\n", - "data[-(names == 'Bob')]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mask = (names == 'Bob') | (names == 'Will')\n", - "mask\n", - "data[mask]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ 
- "data[data < 0] = 0\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[names != 'Joe'] = 7\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Fancy indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.empty((8, 4))\n", - "for i in range(8):\n", - " arr[i] = i\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[[4, 3, 0, 6]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[[-3, -5, -7]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# more on reshape in Chapter 12\n", - "arr = np.arange(32).reshape((8, 4))\n", - "arr\n", - "arr[[1, 5, 7, 2], [0, 3, 1, 2]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Transposing arrays and swapping axes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15).reshape((3, 5))\n", - "arr\n", - "arr.T" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(6, 3)\n", - "np.dot(arr.T, arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] 
- }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(16).reshape((2, 2, 4))\n", - "arr\n", - "arr.transpose((1, 0, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr\n", - "arr.swapaxes(1, 2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Universal Functions: Fast element-wise array functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)\n", - "np.sqrt(arr)\n", - "np.exp(arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = randn(8)\n", - "y = randn(8)\n", - "x\n", - "y\n", - "np.maximum(x, y) # element-wise maximum" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(7) * 5\n", - "np.modf(arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data processing using arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "points = np.arange(-5, 5, 0.01) # 1000 equally spaced points\n", - "xs, ys = np.meshgrid(points, points)\n", - "ys" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from matplotlib.pyplot import imshow, title" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt\n", - "z = np.sqrt(xs ** 2 + ys ** 2)\n", - "z\n", - "plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()\n", - "plt.title(\"Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values\")" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.draw()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Expressing conditional logic as array operations" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])\n", - "yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])\n", - "cond = np.array([True, False, True, True, False])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = [(x if c else y)\n", - " for x, y, c in zip(xarr, yarr, cond)]\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = np.where(cond, xarr, yarr)\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(4, 4)\n", - "arr\n", - "np.where(arr > 0, 2, -2)\n", - "np.where(arr > 0, 2, arr) # set only positive values to 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Not to be executed\n", - "\n", - "result = []\n", - "for i in range(n):\n", - " if cond1[i] and cond2[i]:\n", - " result.append(0)\n", - " elif cond1[i]:\n", - " result.append(1)\n", - " elif cond2[i]:\n", - " result.append(2)\n", - " else:\n", - " result.append(3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Not to be executed\n", - "\n", - "np.where(cond1 & cond2, 0,\n", - " np.where(cond1, 1,\n", - " np.where(cond2, 2, 3)))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Not to be executed\n", - 
"\n", - "result = 1 * cond1 + 2 * cond2 + 3 * -(cond1 | cond2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Mathematical and statistical methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(5, 4) # normally-distributed data\n", - "arr.mean()\n", - "np.mean(arr)\n", - "arr.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.mean(axis=1)\n", - "arr.sum(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n", - "arr.cumsum(0)\n", - "arr.cumprod(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Methods for boolean arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(100)\n", - "(arr > 0).sum() # Number of positive values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bools = np.array([False, False, True, False])\n", - "bools.any()\n", - "bools.all()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Sorting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(8)\n", - "arr\n", - "arr.sort()\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(5, 3)\n", - "arr\n", - "arr.sort(1)\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "large_arr = 
randn(1000)\n", - "large_arr.sort()\n", - "large_arr[int(0.05 * len(large_arr))] # 5% quantile" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Unique and other set logic" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])\n", - "np.unique(names)\n", - "ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])\n", - "np.unique(ints)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sorted(set(names))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = np.array([6, 0, 0, 3, 2, 5, 6])\n", - "np.in1d(values, [2, 3, 6])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "File input and output with arrays" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Storing arrays on disk in binary format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)\n", - "np.save('some_array', arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.load('some_array.npy')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.savez('array_archive.npz', a=arr, b=arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arch = np.load('array_archive.npz')\n", - "arch['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!rm some_array.npy\n", - 
"!rm array_archive.npz" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Saving and loading text files" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat array_ex.txt" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.loadtxt('array_ex.txt', delimiter=',')\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Linear algebra" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = np.array([[1., 2., 3.], [4., 5., 6.]])\n", - "y = np.array([[6., 23.], [-1, 7], [8, 9]])\n", - "x\n", - "y\n", - "x.dot(y) # equivalently np.dot(x, y)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.dot(x, np.ones(3))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.linalg import inv, qr\n", - "X = randn(5, 5)\n", - "mat = X.T.dot(X)\n", - "inv(mat)\n", - "mat.dot(inv(mat))\n", - "q, r = qr(mat)\n", - "r" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Random number generation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "samples = np.random.normal(size=(4, 4))\n", - "samples" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from random import normalvariate\n", - "N = 1000000\n", - "%timeit samples = 
[normalvariate(0, 1) for _ in xrange(N)]\n", - "%timeit np.random.normal(size=N)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Example: Random Walks" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import random\n", - "position = 0\n", - "walk = [position]\n", - "steps = 1000\n", - "for i in xrange(steps):\n", - " step = 1 if random.randint(0, 1) else -1\n", - " position += step\n", - " walk.append(position)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nsteps = 1000\n", - "draws = np.random.randint(0, 2, size=nsteps)\n", - "steps = np.where(draws > 0, 1, -1)\n", - "walk = steps.cumsum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "walk.min()\n", - "walk.max()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "(np.abs(walk) >= 10).argmax()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Simulating many random walks at once" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nwalks = 5000\n", - "nsteps = 1000\n", - "draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1\n", - "steps = np.where(draws > 0, 1, -1)\n", - "walks = steps.cumsum(1)\n", - "walks" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "walks.max()\n", - "walks.min()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "hits30 = 
(np.abs(walks) >= 30).any(1)\n", - "hits30\n", - "hits30.sum() # Number that hit 30 or -30" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)\n", - "crossing_times.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "steps = np.random.normal(loc=0, scale=0.25,\n", - " size=(nwalks, nsteps))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } From de841a184d67b660ad90d6ef82a3fa728a78e9b9 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 28 Oct 2015 19:00:27 -0700 Subject: [PATCH 2/9] Run through ch05 --- ch05.ipynb | 5283 +++++++++++++++++++++++++++------------------------- 1 file changed, 2735 insertions(+), 2548 deletions(-) diff --git a/ch05.ipynb b/ch05.ipynb index 48ef02f04..4ecbf6d81 100644 --- a/ch05.ipynb +++ b/ch05.ipynb @@ -1,2551 +1,2738 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Getting started with pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas import Series, DataFrame\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import 
os\n", + "import matplotlib.pyplot as plt\n", + "np.random.seed(12345)\n", + "plt.rc('figure', figsize=(10, 6))\n", + "from pandas import Series, DataFrame\n", + "import pandas as pd\n", + "np.set_printoptions(precision=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction to pandas data structures" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series([4, 7, -5, 3])\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.values\n", + "obj.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])\n", + "obj2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2['a']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2['d'] = 6\n", + "obj2[['c', 'a', 'd']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2[obj2 > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2 * 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "np.exp(obj2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "'b' in obj2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "'e' in obj2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n", + "obj3 = Series(sdata)\n", + "obj3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "states = ['California', 'Ohio', 'Oregon', 'Texas']\n", + "obj4 = Series(sdata, index=states)\n", + "obj4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.isnull(obj4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.notnull(obj4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj4.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj3 + obj4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj4.name = 'population'\n", + "obj4.index.name = 'state'\n", + "obj4" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']\n", + "obj" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", + " 'year': [2000, 2001, 2002, 2001, 2002],\n", + " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}\n", + "frame = DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "DataFrame(data, columns=['year', 'state', 'pop'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],\n", + " index=['one', 'two', 'three', 'four', 'five'])\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2['state']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2.year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2.ix['three']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2['debt'] = 16.5\n", 
+ "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2['debt'] = np.arange(5.)\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n", + "frame2['debt'] = val\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2['eastern'] = frame2.state == 'Ohio'\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "del frame2['eastern']\n", + "frame2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n", + " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame3 = DataFrame(pop)\n", + "frame3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame3.T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "DataFrame(pop, index=[2001, 2002, 2003])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pdata = {'Ohio': frame3['Ohio'][:-1],\n", + " 'Nevada': frame3['Nevada'][:2]}\n", + "DataFrame(pdata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame3.index.name = 'year'; frame3.columns.name = 
'state'\n", + "frame3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame3.values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2.values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Index objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(range(3), index=['a', 'b', 'c'])\n", + "index = obj.index\n", + "index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "index[1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "index[1] = 'd'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "index = pd.Index(np.arange(3))\n", + "obj2 = Series([1.5, -2.5, 0], index=index)\n", + "obj2.index is index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "'Ohio' in frame3.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "2003 in frame3.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Essential functionality" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reindexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + 
"source": [ + "obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])\n", + "obj2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])\n", + "obj3.reindex(range(6), method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],\n", + " columns=['Ohio', 'Texas', 'California'])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "states = ['Texas', 'Utah', 'California']\n", + "frame.reindex(columns=states)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',\n", + " columns=states)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.ix[['a', 'b', 'c', 'd'], states]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Dropping entries from an axis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n", + "new_obj = obj.drop('c')\n", + "new_obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.drop(['d', 'c'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame(np.arange(16).reshape((4, 4)),\n", + " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", + " columns=['one', 'two', 'three', 'four'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.drop(['Colorado', 'Ohio'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.drop('two', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.drop(['two', 'four'], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing, selection, and filtering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n", + "obj['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj[2:4]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj[['b', 'a', 'd']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj[[1, 3]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj[obj < 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj['b':'c']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj['b':'c'] = 5\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame(np.arange(16).reshape((4, 4)),\n", + " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", + " columns=['one', 'two', 'three', 'four'])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data['two']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[['three', 'one']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[data['three'] > 5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data < 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[data < 5] = 0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix['Colorado', ['two', 'three']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[['Colorado', 'Utah'], [3, 0, 1]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[:'Utah', 'two']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[data.three > 5, :3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Arithmetic and data alignment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n", + "s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s1 + s2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),\n", + " index=['Ohio', 'Texas', 'Colorado'])\n", + "df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", + " index=['Utah', 'Ohio', 'Texas', 
'Oregon'])\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 + df2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Arithmetic methods with fill values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))\n", + "df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 + df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1.add(df2, fill_value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1.reindex(columns=df2.columns, fill_value=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Operations between DataFrame and Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(12.).reshape((3, 4))\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr - arr[0]" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", + " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", + "series = frame.ix[0]\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame - series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "series2 = Series(range(3), index=['b', 'e', 'f'])\n", + "frame + series2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "series3 = frame['d']\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "series3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sub(series3, axis=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Function application and mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),\n", + " index=['Utah', 'Ohio', 'Texas', 'Oregon'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.abs(frame)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "f = lambda x: x.max() - x.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.apply(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.apply(f, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def f(x):\n", + " return Series([x.min(), x.max()], index=['min', 'max'])\n", + "frame.apply(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "format = lambda x: '%.2f' % x\n", + "frame.applymap(format)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame['e'].map(format)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Sorting and ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(range(4), index=['d', 'a', 'b', 'c'])\n", + "obj.sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],\n", + " columns=['d', 'a', 'b', 'c'])\n", + "frame.sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sort_index(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sort_index(axis=1, ascending=False)" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series([4, 7, -3, 2])\n", + "obj.order()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series([4, np.nan, 7, np.nan, -3, 2])\n", + "obj.order()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sort_index(by='b')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sort_index(by=['a', 'b'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series([7, -5, 7, 4, 2, 0, 4])\n", + "obj.rank()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.rank(method='first')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.rank(ascending=False, method='max')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],\n", + " 'c': [-2, 5, 8, -2.5]})\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.rank(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Axis indexes with duplicate values" + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj['a']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj['c']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.ix['b']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Summarizing and computing descriptive statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame([[1.4, np.nan], [7.1, -4.5],\n", + " [np.nan, np.nan], [0.75, -1.3]],\n", + " index=['a', 'b', 'c', 'd'],\n", + " columns=['one', 'two'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.sum(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.mean(axis=1, skipna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.idxmax()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(['a', 'a', 'b', 'c'] * 4)\n", + "obj.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Correlation and covariance" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas.io.data as web\n", + "\n", + "all_data = {}\n", + "for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:\n", + " all_data[ticker] = web.get_data_yahoo(ticker)\n", + "\n", + "price = DataFrame({tic: data['Adj Close']\n", + " for tic, data in all_data.iteritems()})\n", + "volume = DataFrame({tic: data['Volume']\n", + " for tic, data in all_data.iteritems()})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "returns = price.pct_change()\n", + "returns.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "returns.MSFT.corr(returns.IBM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "returns.MSFT.cov(returns.IBM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "returns.corr()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, 
+ "outputs": [], + "source": [ + "returns.cov()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "returns.corrwith(returns.IBM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "returns.corrwith(volume)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unique values, value counts, and membership" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "uniques = obj.unique()\n", + "uniques" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.value_counts(obj.values, sort=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mask = obj.isin(['b', 'c'])\n", + "mask" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj[mask]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame({'Qu1': [1, 3, 4, 3, 4],\n", + " 'Qu2': [2, 3, 1, 2, 3],\n", + " 'Qu3': [1, 5, 2, 4, 4]})\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = 
data.apply(pd.value_counts).fillna(0)\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Handling missing data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])\n", + "string_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "string_data.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "string_data[0] = None\n", + "string_data.isnull()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filtering out missing data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from numpy import nan as NA\n", + "data = Series([1, NA, 3.5, NA, 7])\n", + "data.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[data.notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame([[1., 6.5, 3.], [1., NA, NA],\n", + " [NA, NA, NA], [NA, 6.5, 3.]])\n", + "cleaned = data.dropna()\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cleaned" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.dropna(how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[4] = NA\n", + "data" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.dropna(axis=1, how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame(np.random.randn(7, 3))\n", + "df.ix[:4, 1] = NA; df.ix[:2, 2] = NA\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.dropna(thresh=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Filling in missing data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.fillna({1: 0.5, 3: -1})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# always returns a reference to the filled object\n", + "_ = df.fillna(0, inplace=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame(np.random.randn(6, 3))\n", + "df.ix[2:, 1] = NA; df.ix[4:, 2] = NA\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.fillna(method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.fillna(method='ffill', limit=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = Series([1., NA, 3.5, NA, 7])\n", + "data.fillna(data.mean())" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Hierarchical indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = Series(np.random.randn(10),\n", + " index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],\n", + " [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data['b':'c']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[['b', 'd']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[:, 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.unstack().stack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.arange(12).reshape((4, 3)),\n", + " index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],\n", + " columns=[['Ohio', 'Ohio', 'Colorado'],\n", + " ['Green', 'Red', 'Green']])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.index.names = ['key1', 'key2']\n", + 
"frame.columns.names = ['state', 'color']\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame['Ohio']" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],\n", + " names=['state', 'color'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reordering and sorting levels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.swaplevel('key1', 'key2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sortlevel(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.swaplevel(0, 1).sortlevel(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary statistics by level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sum(level='key2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.sum(level='color', axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using a DataFrame's columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),\n", + " 'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],\n", + " 'd': [0, 1, 2, 0, 1, 2, 3]})\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + 
"source": [ + "frame2 = frame.set_index(['c', 'd'])\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.set_index(['c', 'd'], drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame2.reset_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other pandas topics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Integer indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ser = Series(np.arange(3.))\n", + "ser.iloc[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])\n", + "ser2[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ser.ix[:1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ser3 = Series(range(3), index=[-5, 1, 3])\n", + "ser3.iloc[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])\n", + "frame.iloc[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Panel data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas.io.data as web\n", + "\n", + 
"pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))\n", + " for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pdata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pdata = pdata.swapaxes('items', 'minor')\n", + "pdata['Adj Close']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pdata.ix[:, '6/1/2012', :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pdata.ix['Adj Close', '5/22/2012':, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stacked = pdata.ix[:, '5/30/2012':, :].to_frame()\n", + "stacked" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stacked.to_panel()" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:f87bf79b5f34c6e4082c1c7d147704bbe4384fca0f48ff5ebf12d45a79892467" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Getting started with pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas import Series, DataFrame\n", - "import pandas as pd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "np.random.seed(12345)\n", - "plt.rc('figure', figsize=(10, 6))\n", - "from 
pandas import Series, DataFrame\n", - "import pandas as pd\n", - "np.set_printoptions(precision=4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pwd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Introduction to pandas data structures" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series([4, 7, -5, 3])\n", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.values\n", - "obj.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2 = Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])\n", - "obj2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2['a']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2['d'] = 6\n", - "obj2[['c', 'a', 'd']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2[obj2 > 0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2 * 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "np.exp(obj2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'b' in obj2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'e' in obj2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n", - "obj3 = Series(sdata)\n", - "obj3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = ['California', 'Ohio', 'Oregon', 'Texas']\n", - "obj4 = Series(sdata, index=states)\n", - "obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.isnull(obj4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.notnull(obj4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj4.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj3 + obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj4.name = 'population'\n", - "obj4.index.name = 'state'\n", - "obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']\n", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "DataFrame" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", - " 'year': [2000, 2001, 2002, 2001, 2002],\n", - " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9]}\n", - "frame = DataFrame(data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "DataFrame(data, columns=['year', 'state', 'pop'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2 = DataFrame(data, columns=['year', 'state', 'pop', 'debt'],\n", - " index=['one', 'two', 'three', 'four', 'five'])\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.columns" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2['state']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.year" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.ix['three']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2['debt'] = 16.5\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, 
- "input": [ - "frame2['debt'] = np.arange(5.)\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val = Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n", - "frame2['debt'] = val\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2['eastern'] = frame2.state == 'Ohio'\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "del frame2['eastern']\n", - "frame2.columns" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n", - " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3 = DataFrame(pop)\n", - "frame3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3.T" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "DataFrame(pop, index=[2001, 2002, 2003])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pdata = {'Ohio': frame3['Ohio'][:-1],\n", - " 'Nevada': frame3['Nevada'][:2]}\n", - "DataFrame(pdata)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3.index.name = 'year'; frame3.columns.name = 'state'\n", - "frame3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3.values" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Index objects" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(range(3), index=['a', 'b', 'c'])\n", - "index = obj.index\n", - "index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index[1:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index[1] = 'd'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index = pd.Index(np.arange(3))\n", - "obj2 = Series([1.5, -2.5, 0], index=index)\n", - "obj2.index is index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'Ohio' in frame3.columns" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "2003 in frame3.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Essential functionality" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reindexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2 
= obj.reindex(['a', 'b', 'c', 'd', 'e'])\n", - "obj2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])\n", - "obj3.reindex(range(6), method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.arange(9).reshape((3, 3)), index=['a', 'c', 'd'],\n", - " columns=['Ohio', 'Texas', 'California'])\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = ['Texas', 'Utah', 'California']\n", - "frame.reindex(columns=states)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.reindex(index=['a', 'b', 'c', 'd'], method='ffill',\n", - " columns=states)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.ix[['a', 'b', 'c', 'd'], states]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Dropping entries from an axis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n", - "new_obj = obj.drop('c')\n", - "new_obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "obj.drop(['d', 'c'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame(np.arange(16).reshape((4, 4)),\n", - " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", - " columns=['one', 'two', 'three', 'four'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop(['Colorado', 'Ohio'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop('two', axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop(['two', 'four'], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Indexing, selection, and filtering" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n", - "obj['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj[1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj[2:4]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj[['b', 'a', 'd']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj[[1, 3]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj[obj < 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "obj['b':'c']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj['b':'c'] = 5\n", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame(np.arange(16).reshape((4, 4)),\n", - " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", - " columns=['one', 'two', 'three', 'four'])\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['two']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[['three', 'one']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[:2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[data['three'] > 5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data < 5" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[data < 5] = 0" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.ix['Colorado', ['two', 'three']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.ix[['Colorado', 'Utah'], [3, 0, 1]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "data.ix[2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.ix[:'Utah', 'two']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.ix[data.three > 5, :3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Arithmetic and data alignment" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n", - "s2 = Series([-2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 + s2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),\n", - " index=['Ohio', 'Texas', 'Colorado'])\n", - "df2 = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", - " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", - "df1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 + df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - 
"Arithmetic methods with fill values" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))\n", - "df2 = DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))\n", - "df1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 + df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1.add(df2, fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1.reindex(columns=df2.columns, fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Operations between DataFrame and Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(12.).reshape((3, 4))\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr - arr[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", - " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", - "series = frame.ix[0]\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "series" - ], - "language": "python", - "metadata": {}, - "outputs": [] - 
}, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame - series" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "series2 = Series(range(3), index=['b', 'e', 'f'])\n", - "frame + series2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "series3 = frame['d']\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "series3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sub(series3, axis=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Function application and mapping" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.random.randn(4, 3), columns=list('bde'),\n", - " index=['Utah', 'Ohio', 'Texas', 'Oregon'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.abs(frame)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f = lambda x: x.max() - x.min()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.apply(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.apply(f, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "def f(x):\n", - " return Series([x.min(), x.max()], index=['min', 'max'])\n", - "frame.apply(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "format = lambda x: '%.2f' % x\n", - "frame.applymap(format)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['e'].map(format)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Sorting and ranking" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(range(4), index=['d', 'a', 'b', 'c'])\n", - "obj.sort_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.arange(8).reshape((2, 4)), index=['three', 'one'],\n", - " columns=['d', 'a', 'b', 'c'])\n", - "frame.sort_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sort_index(axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sort_index(axis=1, ascending=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series([4, 7, -3, 2])\n", - "obj.order()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series([4, np.nan, 7, np.nan, -3, 2])\n", - "obj.order()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", - "frame" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sort_index(by='b')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sort_index(by=['a', 'b'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series([7, -5, 7, 4, 2, 0, 4])\n", - "obj.rank()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.rank(method='first')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.rank(ascending=False, method='max')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],\n", - " 'c': [-2, 5, 8, -2.5]})\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.rank(axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Axis indexes with duplicate values" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.index.is_unique" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj['a']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj['c']" - 
], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.ix['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Summarizing and computing descriptive statistics" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame([[1.4, np.nan], [7.1, -4.5],\n", - " [np.nan, np.nan], [0.75, -1.3]],\n", - " index=['a', 'b', 'c', 'd'],\n", - " columns=['one', 'two'])\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.sum(axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.mean(axis=1, skipna=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.idxmax()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.cumsum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(['a', 'a', 'b', 'c'] * 4)\n", - "obj.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - 
"metadata": {}, - "source": [ - "Correlation and covariance" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas.io.data as web\n", - "\n", - "all_data = {}\n", - "for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:\n", - " all_data[ticker] = web.get_data_yahoo(ticker)\n", - "\n", - "price = DataFrame({tic: data['Adj Close']\n", - " for tic, data in all_data.iteritems()})\n", - "volume = DataFrame({tic: data['Volume']\n", - " for tic, data in all_data.iteritems()})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns = price.pct_change()\n", - "returns.tail()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.MSFT.corr(returns.IBM)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.MSFT.cov(returns.IBM)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.corr()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.cov()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.corrwith(returns.IBM)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.corrwith(volume)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Unique values, value counts, and membership" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])" - ], - "language": "python", - 
"metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "uniques = obj.unique()\n", - "uniques" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.value_counts(obj.values, sort=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mask = obj.isin(['b', 'c'])\n", - "mask" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj[mask]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame({'Qu1': [1, 3, 4, 3, 4],\n", - " 'Qu2': [2, 3, 1, 2, 3],\n", - " 'Qu3': [1, 5, 2, 4, 4]})\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = data.apply(pd.value_counts).fillna(0)\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Handling missing data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado'])\n", - "string_data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "string_data.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "string_data[0] = None\n", - "string_data.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"heading", - "level": 3, - "metadata": {}, - "source": [ - "Filtering out missing data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy import nan as NA\n", - "data = Series([1, NA, 3.5, NA, 7])\n", - "data.dropna()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[data.notnull()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame([[1., 6.5, 3.], [1., NA, NA],\n", - " [NA, NA, NA], [NA, 6.5, 3.]])\n", - "cleaned = data.dropna()\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cleaned" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.dropna(how='all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[4] = NA\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.dropna(axis=1, how='all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame(np.random.randn(7, 3))\n", - "df.ix[:4, 1] = NA; df.ix[:2, 2] = NA\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.dropna(thresh=3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Filling in missing data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.fillna(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - 
}, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.fillna({1: 0.5, 3: -1})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# always returns a reference to the filled object\n", - "_ = df.fillna(0, inplace=True)\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame(np.random.randn(6, 3))\n", - "df.ix[2:, 1] = NA; df.ix[4:, 2] = NA\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.fillna(method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.fillna(method='ffill', limit=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = Series([1., NA, 3.5, NA, 7])\n", - "data.fillna(data.mean())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Hierarchical indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = Series(np.random.randn(10),\n", - " index=[['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'd', 'd'],\n", - " [1, 2, 3, 1, 2, 3, 1, 2, 2, 3]])\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['b':'c']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "data.ix[['b', 'd']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[:, 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.unstack().stack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.arange(12).reshape((4, 3)),\n", - " index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],\n", - " columns=[['Ohio', 'Ohio', 'Colorado'],\n", - " ['Green', 'Red', 'Green']])\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.index.names = ['key1', 'key2']\n", - "frame.columns.names = ['state', 'color']\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['Ohio']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],\n", - " names=['state', 'color'])" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reordering and sorting levels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.swaplevel('key1', 'key2')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sortlevel(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"frame.swaplevel(0, 1).sortlevel(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Summary statistics by level" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sum(level='key2')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sum(level='color', axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Using a DataFrame's columns" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame({'a': range(7), 'b': range(7, 0, -1),\n", - " 'c': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],\n", - " 'd': [0, 1, 2, 0, 1, 2, 3]})\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2 = frame.set_index(['c', 'd'])\n", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.set_index(['c', 'd'], drop=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.reset_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Other pandas topics" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Integer indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser = Series(np.arange(3.))\n", - "ser.iloc[-1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser2 = Series(np.arange(3.), index=['a', 'b', 'c'])\n", - "ser2[-1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser.ix[:1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser3 = Series(range(3), index=[-5, 1, 3])\n", - "ser3.iloc[2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.arange(6).reshape((3, 2)), index=[2, 0, 1])\n", - "frame.iloc[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Panel data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas.io.data as web\n", - "\n", - "pdata = pd.Panel(dict((stk, web.get_data_yahoo(stk))\n", - " for stk in ['AAPL', 'GOOG', 'MSFT', 'DELL']))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pdata" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pdata = pdata.swapaxes('items', 'minor')\n", - "pdata['Adj Close']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pdata.ix[:, '6/1/2012', :]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pdata.ix['Adj Close', '5/22/2012':, :]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stacked = pdata.ix[:, '5/30/2012':, :].to_frame()\n", - "stacked" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stacked.to_panel()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 0aabc55ef1ebc391657d7e4e00999cac19673b82 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 28 Oct 2015 19:03:40 -0700 Subject: [PATCH 3/9] Run through ch06 --- ch06.ipynb | 1995 ++++++++++++++++++++++++---------------------- ch06/tseries.csv | 14 +- 2 files changed, 1036 insertions(+), 973 deletions(-) diff --git a/ch06.ipynb b/ch06.ipynb index 49d30209c..eef10deac 100644 --- a/ch06.ipynb +++ b/ch06.ipynb @@ -1,969 +1,1032 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data loading, storage, and file formats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import os\n", + "import sys\n", + "import matplotlib.pyplot as plt\n", + "np.random.seed(12345)\n", + "plt.rc('figure', figsize=(10, 6))\n", + "from pandas import Series, DataFrame\n", + "import pandas as pd\n", + "np.set_printoptions(precision=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reading and Writing Data in Text Format" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat ch06/ex1.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = pd.read_csv('ch06/ex1.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.read_table('ch06/ex1.csv', sep=',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat ch06/ex2.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.read_csv('ch06/ex2.csv', header=None)\n", + "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names = ['a', 'b', 'c', 'd', 'message']\n", + "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat ch06/csv_mindex.csv\n", + "parsed = pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])\n", + "parsed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "list(open('ch06/ex3.txt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = pd.read_table('ch06/ex3.txt', sep='\\s+')\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat ch06/ex4.csv\n", + "pd.read_csv('ch06/ex4.csv', 
skiprows=[0, 2, 3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat ch06/ex5.csv\n", + "result = pd.read_csv('ch06/ex5.csv')\n", + "result\n", + "pd.isnull(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n", + "pd.read_csv('ch06/ex5.csv', na_values=sentinels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reading text files in pieces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = pd.read_csv('ch06/ex6.csv')\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.read_csv('ch06/ex6.csv', nrows=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", + "chunker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", + "\n", + "tot = Series([])\n", + "for piece in chunker:\n", + " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", + "\n", + "tot = tot.order(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tot[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Writing data out to text format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.read_csv('ch06/ex5.csv')\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.to_csv('ch06/out.csv')\n", + "!cat ch06/out.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, sep='|')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, na_rep='NULL')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, index=False, header=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dates = pd.date_range('1/1/2000', periods=7)\n", + "ts = Series(np.arange(7), index=dates)\n", + "ts.to_csv('ch06/tseries.csv')\n", + "!cat ch06/tseries.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Series.from_csv('ch06/tseries.csv', parse_dates=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Manually working with delimited formats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!cat ch06/ex7.csv" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import csv\n", + "f = open('ch06/ex7.csv')\n", + "\n", + "reader = csv.reader(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for line in reader:\n", + " print(line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "lines = list(csv.reader(open('ch06/ex7.csv')))\n", + "header, values = lines[0], lines[1:]\n", + "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", + "data_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "class my_dialect(csv.Dialect):\n", + " lineterminator = '\\n'\n", + " delimiter = ';'\n", + " quotechar = '\"'\n", + " quoting = csv.QUOTE_MINIMAL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "with open('mydata.csv', 'w') as f:\n", + " writer = csv.writer(f, dialect=my_dialect)\n", + " writer.writerow(('one', 'two', 'three'))\n", + " writer.writerow(('1', '2', '3'))\n", + " writer.writerow(('4', '5', '6'))\n", + " writer.writerow(('7', '8', '9'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%cat mydata.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### JSON data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "obj = \"\"\"\n", + "{\"name\": \"Wes\",\n", + " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", + " \"pet\": null,\n", + " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n", + " {\"name\": \"Katie\", \"age\": 33, 
\"pet\": \"Cisco\"}]\n", + "}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import json\n", + "result = json.loads(obj)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "asjson = json.dumps(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "siblings = DataFrame(result['siblings'], columns=['name', 'age'])\n", + "siblings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### XML and HTML, Web scraping\n", + "\n", + "**NB. The Yahoo! Finance API has changed and this example no longer works**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from lxml.html import parse\n", + "from urllib2 import urlopen\n", + "\n", + "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n", + "\n", + "doc = parsed.getroot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "links = doc.findall('.//a')\n", + "links[15:20]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "lnk = links[28]\n", + "lnk\n", + "lnk.get('href')\n", + "lnk.text_content()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "urls = [lnk.get('href') for lnk in doc.findall('.//a')]\n", + "urls[-10:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tables = doc.findall('.//table')\n", + "calls = tables[9]\n", + "puts = tables[13]" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rows = calls.findall('.//tr')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def _unpack(row, kind='td'):\n", + " elts = row.findall('.//%s' % kind)\n", + " return [val.text_content() for val in elts]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "_unpack(rows[0], kind='th')\n", + "_unpack(rows[1], kind='td')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas.io.parsers import TextParser\n", + "\n", + "def parse_options_data(table):\n", + " rows = table.findall('.//tr')\n", + " header = _unpack(rows[0], kind='th')\n", + " data = [_unpack(r) for r in rows[1:]]\n", + " return TextParser(data, names=header).get_chunk()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "call_data = parse_options_data(calls)\n", + "put_data = parse_options_data(puts)\n", + "call_data[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Parsing XML with lxml.objectify" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%cd ch06/mta_perf/Performance_XML_Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "!head -21 Performance_MNR.xml" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from lxml import objectify\n", + "\n", + "path = 'Performance_MNR.xml'\n", + "parsed = 
objectify.parse(open(path))\n", + "root = parsed.getroot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = []\n", + "\n", + "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", + " 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n", + "\n", + "for elt in root.INDICATOR:\n", + " el_data = {}\n", + " for child in elt.getchildren():\n", + " if child.tag in skip_fields:\n", + " continue\n", + " el_data[child.tag] = child.pyval\n", + " data.append(el_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "perf = DataFrame(data)\n", + "perf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "root" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "root.get('href')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "root.text" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Binary data formats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cd ../.." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = pd.read_csv('ch06/ex1.csv')\n", + "frame\n", + "frame.to_pickle('ch06/frame_pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.read_pickle('ch06/frame_pickle')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using HDF5 format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "store = pd.HDFStore('mydata.h5')\n", + "store['obj1'] = frame\n", + "store['obj1_col'] = frame['a']\n", + "store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "store['obj1']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "store.close()\n", + "os.remove('mydata.h5')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Interacting with HTML and Web APIs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import requests\n", + "url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'\n", + "resp = requests.get(url)\n", + "resp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "issue_labels = DataFrame(data)\n", + "issue_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Interacting with databases" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import sqlite3\n", + "\n", + "query = \"\"\"\n", + "CREATE TABLE test\n", + "(a VARCHAR(20), b VARCHAR(20),\n", + " c REAL, d INTEGER\n", + ");\"\"\"\n", + "\n", + "con = sqlite3.connect(':memory:')\n", + "con.execute(query)\n", + "con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = [('Atlanta', 'Georgia', 1.25, 6),\n", + " ('Tallahassee', 'Florida', 2.6, 3),\n", + " ('Sacramento', 'California', 1.7, 5)]\n", + "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", + "\n", + "con.executemany(stmt, data)\n", + "con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cursor = con.execute('select * from test')\n", + "rows = cursor.fetchall()\n", + "rows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cursor.description" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "DataFrame(rows, columns=zip(*cursor.description)[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas.io.sql as sql\n", + "sql.read_sql('select * from test', con)" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:33b58f63005d8bb0bc3878707d8ea8a04902b6379eca42e5e6c26680a530c487" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data loading, storage, and file formats" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import 
numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "np.random.seed(12345)\n", - "plt.rc('figure', figsize=(10, 6))\n", - "from pandas import Series, DataFrame\n", - "import pandas as pd\n", - "np.set_printoptions(precision=4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pwd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Reading and Writing Data in Text Format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat ch06/ex1.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.read_csv('ch06/ex1.csv')\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_table('ch06/ex1.csv', sep=',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat ch06/ex2.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_csv('ch06/ex2.csv', header=None)\n", - "pd.read_csv('ch06/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = ['a', 'b', 'c', 'd', 'message']\n", - "pd.read_csv('ch06/ex2.csv', names=names, index_col='message')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat ch06/csv_mindex.csv\n", - "parsed = 
pd.read_csv('ch06/csv_mindex.csv', index_col=['key1', 'key2'])\n", - "parsed" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "list(open('ch06/ex3.txt'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.read_table('ch06/ex3.txt', sep='\\s+')\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat ch06/ex4.csv\n", - "pd.read_csv('ch06/ex4.csv', skiprows=[0, 2, 3])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat ch06/ex5.csv\n", - "result = pd.read_csv('ch06/ex5.csv')\n", - "result\n", - "pd.isnull(result)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.read_csv('ch06/ex5.csv', na_values=['NULL'])\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}\n", - "pd.read_csv('ch06/ex5.csv', na_values=sentinels)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reading text files in pieces" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.read_csv('ch06/ex6.csv')\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_csv('ch06/ex6.csv', nrows=5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "chunker = pd.read_csv('ch06/ex6.csv', 
chunksize=1000)\n", - "chunker" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)\n", - "\n", - "tot = Series([])\n", - "for piece in chunker:\n", - " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", - "\n", - "tot = tot.order(ascending=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tot[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Writing data out to text format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('ch06/ex5.csv')\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv('ch06/out.csv')\n", - "!cat ch06/out.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, sep='|')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, na_rep='NULL')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, index=False, header=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dates = pd.date_range('1/1/2000', periods=7)\n", - "ts = Series(np.arange(7), index=dates)\n", - 
"ts.to_csv('ch06/tseries.csv')\n", - "!cat ch06/tseries.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "Series.from_csv('ch06/tseries.csv', parse_dates=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Manually working with delimited formats" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat ch06/ex7.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import csv\n", - "f = open('ch06/ex7.csv')\n", - "\n", - "reader = csv.reader(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for line in reader:\n", - " print(line)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "lines = list(csv.reader(open('ch06/ex7.csv')))\n", - "header, values = lines[0], lines[1:]\n", - "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", - "data_dict" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "class my_dialect(csv.Dialect):\n", - " lineterminator = '\\n'\n", - " delimiter = ';'\n", - " quotechar = '\"'\n", - " quoting = csv.QUOTE_MINIMAL" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "with open('mydata.csv', 'w') as f:\n", - " writer = csv.writer(f, dialect=my_dialect)\n", - " writer.writerow(('one', 'two', 'three'))\n", - " writer.writerow(('1', '2', '3'))\n", - " writer.writerow(('4', '5', '6'))\n", - " writer.writerow(('7', '8', '9'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "%cat mydata.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "JSON data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = \"\"\"\n", - "{\"name\": \"Wes\",\n", - " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", - " \"pet\": null,\n", - " \"siblings\": [{\"name\": \"Scott\", \"age\": 25, \"pet\": \"Zuko\"},\n", - " {\"name\": \"Katie\", \"age\": 33, \"pet\": \"Cisco\"}]\n", - "}\n", - "\"\"\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import json\n", - "result = json.loads(obj)\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "asjson = json.dumps(result)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "siblings = DataFrame(result['siblings'], columns=['name', 'age'])\n", - "siblings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "XML and HTML, Web scraping" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from lxml.html import parse\n", - "from urllib2 import urlopen\n", - "\n", - "parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))\n", - "\n", - "doc = parsed.getroot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "links = doc.findall('.//a')\n", - "links[15:20]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "lnk = links[28]\n", - "lnk\n", - "lnk.get('href')\n", - "lnk.text_content()" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "urls = [lnk.get('href') for lnk in doc.findall('.//a')]\n", - "urls[-10:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tables = doc.findall('.//table')\n", - "calls = tables[9]\n", - "puts = tables[13]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rows = calls.findall('.//tr')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def _unpack(row, kind='td'):\n", - " elts = row.findall('.//%s' % kind)\n", - " return [val.text_content() for val in elts]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "_unpack(rows[0], kind='th')\n", - "_unpack(rows[1], kind='td')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas.io.parsers import TextParser\n", - "\n", - "def parse_options_data(table):\n", - " rows = table.findall('.//tr')\n", - " header = _unpack(rows[0], kind='th')\n", - " data = [_unpack(r) for r in rows[1:]]\n", - " return TextParser(data, names=header).get_chunk()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "call_data = parse_options_data(calls)\n", - "put_data = parse_options_data(puts)\n", - "call_data[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Parsing XML with lxml.objectify" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd mta_perf/Performance_XML_Data" - ], - "language": "python", 
- "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!head -21 Performance_MNR.xml" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from lxml import objectify\n", - "\n", - "path = 'Performance_MNR.xml'\n", - "parsed = objectify.parse(open(path))\n", - "root = parsed.getroot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = []\n", - "\n", - "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", - " 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n", - "\n", - "for elt in root.INDICATOR:\n", - " el_data = {}\n", - " for child in elt.getchildren():\n", - " if child.tag in skip_fields:\n", - " continue\n", - " el_data[child.tag] = child.pyval\n", - " data.append(el_data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "perf = DataFrame(data)\n", - "perf" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "root" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "root.get('href')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": true, - "input": [ - "root.text" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Binary data formats" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cd ../.." 
- ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.read_csv('ch06/ex1.csv')\n", - "frame\n", - "frame.to_pickle('ch06/frame_pickle')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_pickle('ch06/frame_pickle')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Using HDF5 format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "store = pd.HDFStore('mydata.h5')\n", - "store['obj1'] = frame\n", - "store['obj1_col'] = frame['a']\n", - "store" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "store['obj1']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "store.close()\n", - "os.remove('mydata.h5')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Interacting with HTML and Web APIs" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import requests\n", - "url = 'https://api.github.com/repos/pydata/pandas/milestones/28/labels'\n", - "resp = requests.get(url)\n", - "resp" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "issue_labels = DataFrame(data)\n", - "issue_labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Interacting with databases" - ] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "import sqlite3\n", - "\n", - "query = \"\"\"\n", - "CREATE TABLE test\n", - "(a VARCHAR(20), b VARCHAR(20),\n", - " c REAL, d INTEGER\n", - ");\"\"\"\n", - "\n", - "con = sqlite3.connect(':memory:')\n", - "con.execute(query)\n", - "con.commit()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = [('Atlanta', 'Georgia', 1.25, 6),\n", - " ('Tallahassee', 'Florida', 2.6, 3),\n", - " ('Sacramento', 'California', 1.7, 5)]\n", - "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", - "\n", - "con.executemany(stmt, data)\n", - "con.commit()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cursor = con.execute('select * from test')\n", - "rows = cursor.fetchall()\n", - "rows" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cursor.description" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "DataFrame(rows, columns=zip(*cursor.description)[0])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas.io.sql as sql\n", - "sql.read_sql('select * from test', con)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git 
a/ch06/tseries.csv b/ch06/tseries.csv index 17e25dcf0..d7203db1b 100644 --- a/ch06/tseries.csv +++ b/ch06/tseries.csv @@ -1,7 +1,7 @@ -2000-01-01 00:00:00,0 -2000-01-02 00:00:00,1 -2000-01-03 00:00:00,2 -2000-01-04 00:00:00,3 -2000-01-05 00:00:00,4 -2000-01-06 00:00:00,5 -2000-01-07 00:00:00,6 +2000-01-01,0 +2000-01-02,1 +2000-01-03,2 +2000-01-04,3 +2000-01-05,4 +2000-01-06,5 +2000-01-07,6 From 33d9310180ef351c5632180b4c5359b7cc4b5d4e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 28 Oct 2015 19:06:18 -0700 Subject: [PATCH 4/9] Run through ch07 and fix issues --- ch07.ipynb | 4742 +++++++++++++++++++++++++++------------------------- 1 file changed, 2453 insertions(+), 2289 deletions(-) diff --git a/ch07.ipynb b/ch07.ipynb index 4cbcc731a..8d4a9637e 100644 --- a/ch07.ipynb +++ b/ch07.ipynb @@ -1,2292 +1,2456 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Wrangling: Clean, Transform, Merge, Reshape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "np.random.seed(12345)\n", + "plt.rc('figure', figsize=(10, 6))\n", + "from pandas import Series, DataFrame\n", + "import pandas\n", + "import pandas as pd\n", + "np.set_printoptions(precision=4, threshold=500)\n", + "pd.options.display.max_rows = 100" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Combining and merging data sets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Database-style DataFrame merges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + 
}, + "outputs": [], + "source": [ + "df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", + " 'data1': range(7)})\n", + "df2 = DataFrame({'key': ['a', 'b', 'd'],\n", + " 'data2': range(3)})\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, on='key')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", + " 'data1': range(7)})\n", + "df4 = DataFrame({'rkey': ['a', 'b', 'd'],\n", + " 'data2': range(3)})\n", + "pd.merge(df3, df4, left_on='lkey', right_on='rkey')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", + " 'data1': range(6)})\n", + "df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],\n", + " 'data2': range(5)})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, on='key', 
how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, how='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left = DataFrame({'key1': ['foo', 'foo', 'bar'],\n", + " 'key2': ['one', 'two', 'one'],\n", + " 'lval': [1, 2, 3]})\n", + "right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],\n", + " 'key2': ['one', 'one', 'one', 'two'],\n", + " 'rval': [4, 5, 6, 7]})\n", + "pd.merge(left, right, on=['key1', 'key2'], how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(left, right, on='key1')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(left, right, on='key1', suffixes=('_left', '_right'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Merging on index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],\n", + " 'value': range(6)})\n", + "right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "right1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(left1, right1, left_on='key', right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false + }, + "outputs": [], + "source": [ + "pd.merge(left1, right1, left_on='key', right_index=True, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", + " 'key2': [2000, 2001, 2002, 2001, 2002],\n", + " 'data': np.arange(5.)})\n", + "righth = DataFrame(np.arange(12).reshape((6, 2)),\n", + " index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],\n", + " [2001, 2000, 2000, 2000, 2001, 2002]],\n", + " columns=['event1', 'event2'])\n", + "lefth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "righth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(lefth, righth, left_on=['key1', 'key2'],\n", + " right_index=True, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], index=['a', 'c', 'e'],\n", + " columns=['Ohio', 'Nevada'])\n", + "right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n", + " index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "right2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false 
+ }, + "outputs": [], + "source": [ + "pd.merge(left2, right2, how='outer', left_index=True, right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left2.join(right2, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left1.join(right1, on='key')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n", + " index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left2.join([right2, another])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "left2.join([right2, another], how='outer')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Concatenating along an axis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(12).reshape((3, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.concatenate([arr, arr], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s1 = Series([0, 1], index=['a', 'b'])\n", + "s2 = Series([2, 3, 4], index=['c', 'd', 'e'])\n", + "s3 = Series([5, 6], index=['f', 'g'])" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([s1, s2, s3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([s1, s2, s3], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s4 = pd.concat([s1 * 5, s3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([s1, s4], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([s1, s4], axis=1, join='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Much more on the unstack function later\n", + "result.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],\n", + " 
columns=['one', 'two'])\n", + "df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],\n", + " columns=['three', 'four'])\n", + "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat({'level1': df1, 'level2': df2}, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],\n", + " names=['upper', 'lower'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])\n", + "df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.concat([df1, df2], ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Combining data with overlap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],\n", + " index=['f', 'e', 'd', 'c', 'b', 'a'])\n", + "b = Series(np.arange(len(a), dtype=np.float64),\n", + " index=['f', 'e', 'd', 'c', 'b', 'a'])\n", + "b[-1] = np.nan" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "a" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.where(pd.isnull(a), b, a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "b[:-2].combine_first(a[2:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df1 = DataFrame({'a': [1., np.nan, 5., np.nan],\n", + " 'b': [np.nan, 2., np.nan, 6.],\n", + " 'c': range(2, 18, 4)})\n", + "df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],\n", + " 'b': [np.nan, 3., 4., 6., 8.]})\n", + "df1.combine_first(df2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reshaping and pivoting" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reshaping with hierarchical indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame(np.arange(6).reshape((2, 3)),\n", + " index=pd.Index(['Ohio', 'Colorado'], name='state'),\n", + " columns=pd.Index(['one', 'two', 'three'], name='number'))\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = data.stack()\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result.unstack(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": 
[], + "source": [ + "result.unstack('state')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])\n", + "s2 = Series([4, 5, 6], index=['c', 'd', 'e'])\n", + "data2 = pd.concat([s1, s2], keys=['one', 'two'])\n", + "data2.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data2.unstack().stack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data2.unstack().stack(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame({'left': result, 'right': result + 5},\n", + " columns=pd.Index(['left', 'right'], name='side'))\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.unstack('state')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.unstack('state').stack('side')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Pivoting \"long\" to \"wide\" format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.read_csv('ch07/macrodata.csv')\n", + "periods = pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')\n", + "data = DataFrame(data.to_records(),\n", + " columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),\n", + " index=periods.to_timestamp('D', 'end'))\n", + "\n", + "ldata = data.stack().reset_index().rename(columns={0: 'value'})\n", + "wdata = ldata.pivot('date', 'item', 'value')" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ldata[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pivoted = ldata.pivot('date', 'item', 'value')\n", + "pivoted.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ldata['value2'] = np.random.randn(len(ldata))\n", + "ldata[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pivoted = ldata.pivot('date', 'item')\n", + "pivoted[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pivoted['value'][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "unstacked = ldata.set_index(['date', 'item']).unstack('item')\n", + "unstacked[:7]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data transformation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Removing duplicates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,\n", + " 'k2': [1, 1, 2, 3, 3, 4, 4]})\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.duplicated()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "data['v1'] = range(7)\n", + "data.drop_duplicates(['k1'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.drop_duplicates(['k1', 'k2'], take_last=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transforming data using a function or mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',\n", + " 'corned beef', 'Bacon', 'pastrami', 'honey ham',\n", + " 'nova lox'],\n", + " 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "meat_to_animal = {\n", + " 'bacon': 'pig',\n", + " 'pulled pork': 'pig',\n", + " 'pastrami': 'cow',\n", + " 'corned beef': 'cow',\n", + " 'honey ham': 'pig',\n", + " 'nova lox': 'salmon'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data['animal'] = data['food'].map(str.lower).map(meat_to_animal)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data['food'].map(lambda x: meat_to_animal[x.lower()])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Replacing values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = Series([1., -999., 2., -999., -1000., 3.])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.replace(-999, np.nan)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.replace([-999, -1000], np.nan)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.replace([-999, -1000], [np.nan, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.replace({-999: np.nan, -1000: 0})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Renaming axis indexes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = DataFrame(np.arange(12).reshape((3, 4)),\n", + " index=['Ohio', 'Colorado', 'New York'],\n", + " columns=['one', 'two', 'three', 'four'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.index.map(str.upper)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.index = data.index.map(str.upper)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.rename(index=str.title, columns=str.upper)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.rename(index={'OHIO': 'INDIANA'},\n", + " columns={'three': 'peekaboo'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Always returns a reference to a DataFrame\n", + "_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "### Discretization and binning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bins = [18, 25, 35, 60, 100]\n", + "cats = pd.cut(ages, bins)\n", + "cats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cats.labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cats.levels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.value_counts(cats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.cut(ages, [18, 26, 36, 61, 100], right=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']\n", + "pd.cut(ages, bins, labels=group_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = np.random.rand(20)\n", + "pd.cut(data, 4, precision=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = np.random.randn(1000) # Normally distributed\n", + "cats = pd.qcut(data, 4) # Cut into quartiles\n", + "cats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.value_counts(cats)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Detecting and filtering outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12345)\n", + "data = DataFrame(np.random.randn(1000, 4))\n", + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "col = data[3]\n", + "col[np.abs(col) > 3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[(np.abs(data) > 3).any(1)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[np.abs(data) > 3] = np.sign(data) * 3\n", + "data.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Permutation and random sampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame(np.arange(5 * 4).reshape((5, 4)))\n", + "sampler = np.random.permutation(5)\n", + "sampler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.take(sampler)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.take(np.random.permutation(len(df))[:3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "bag = np.array([5, 7, -1, 6, 4])\n", + "sampler = np.random.randint(0, len(bag), size=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sampler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "draws = bag.take(sampler)\n", + "draws" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Computing indicator / dummy variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", + " 'data1': range(6)})\n", + "pd.get_dummies(df['key'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dummies = pd.get_dummies(df['key'], prefix='key')\n", + "df_with_dummy = df[['data1']].join(dummies)\n", + "df_with_dummy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mnames = ['movie_id', 'title', 'genres']\n", + "movies = pd.read_table('ch02/movielens/movies.dat', sep='::', header=None,\n", + " names=mnames)\n", + "movies[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "genre_iter = (set(x.split('|')) for x in movies.genres)\n", + "genres = sorted(set.union(*genre_iter))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + 
"source": [ + "for i, gen in enumerate(movies.genres):\n", + " dummies.ix[i, gen.split('|')] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "movies_windic = movies.join(dummies.add_prefix('Genre_'))\n", + "movies_windic.ix[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12345)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "values = np.random.rand(10)\n", + "values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n", + "pd.get_dummies(pd.cut(values, bins))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## String manipulation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### String object methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val = 'a,b, guido'\n", + "val.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pieces = [x.strip() for x in val.split(',')]\n", + "pieces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "first, second, third = pieces\n", + "first + '::' + second + '::' + third" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "'::'.join(pieces)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "'guido' in val" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val.index(',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val.find(':')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val.index(':')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val.count(',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val.replace(',', '::')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "val.replace(',', '')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Regular expressions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import re\n", + "text = \"foo bar\\t baz \\tqux\"\n", + "re.split('\\s+', text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "regex = re.compile('\\s+')\n", + "regex.split(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "regex.findall(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "text = \"\"\"Dave dave@google.com\n", + "Steve steve@gmail.com\n", + "Rob rob@gmail.com\n", + "Ryan ryan@yahoo.com\n", + "\"\"\"\n", + "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'\n", + "\n", + "# re.IGNORECASE makes the regex 
case-insensitive\n", + "regex = re.compile(pattern, flags=re.IGNORECASE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "regex.findall(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "m = regex.search(text)\n", + "m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "text[m.start():m.end()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(regex.match(text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(regex.sub('REDACTED', text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'\n", + "regex = re.compile(pattern, flags=re.IGNORECASE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "m = regex.match('wesm@bright.net')\n", + "m.groups()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "regex.findall(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(regex.sub(r'Username: \\1, Domain: \\2, Suffix: \\3', text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "regex = re.compile(r\"\"\"\n", + " (?P<username>[A-Z0-9._%+-]+)\n", + " @\n", + " (?P<domain>[A-Z0-9.-]+)\n", + " \\.\n", + " (?P<suffix>[A-Z]{2,4})\"\"\", 
flags=re.IGNORECASE|re.VERBOSE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "m = regex.match('wesm@bright.net')\n", + "m.groupdict()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Vectorized string functions in pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',\n", + " 'Rob': 'rob@gmail.com', 'Wes': np.nan}\n", + "data = Series(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.str.contains('gmail')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pattern" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.str.findall(pattern, flags=re.IGNORECASE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "matches = data.str.match(pattern, flags=re.IGNORECASE)\n", + "matches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "matches.str.get(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "matches.str[0]" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.str[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: USDA Food Database" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "{\n", + " \"id\": 21441,\n", + " \"description\": \"KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,\n", + "Wing, meat and skin with breading\",\n", + " \"tags\": [\"KFC\"],\n", + " \"manufacturer\": \"Kentucky Fried Chicken\",\n", + " \"group\": \"Fast Foods\",\n", + " \"portions\": [\n", + " {\n", + " \"amount\": 1,\n", + " \"unit\": \"wing, with skin\",\n", + " \"grams\": 68.0\n", + " },\n", + "\n", + " ...\n", + " ],\n", + " \"nutrients\": [\n", + " {\n", + " \"value\": 20.8,\n", + " \"units\": \"g\",\n", + " \"description\": \"Protein\",\n", + " \"group\": \"Composition\"\n", + " },\n", + "\n", + " ...\n", + " ]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import json\n", + "db = json.load(open('ch07/foods-2011-10-03.json'))\n", + "len(db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "db[0].keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "db[0]['nutrients'][0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nutrients = DataFrame(db[0]['nutrients'])\n", + "nutrients[:7]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "info_keys = ['description', 'group', 'id', 'manufacturer']\n", + "info = DataFrame(db, columns=info_keys)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "collapsed": false + }, + "outputs": [], + "source": [ + "info[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.value_counts(info.group)[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nutrients = []\n", + "\n", + "for rec in db:\n", + " fnuts = DataFrame(rec['nutrients'])\n", + " fnuts['id'] = rec['id']\n", + " nutrients.append(fnuts)\n", + "\n", + "nutrients = pd.concat(nutrients, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nutrients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nutrients.duplicated().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "nutrients = nutrients.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "col_mapping = {'description' : 'food',\n", + " 'group' : 'fgroup'}\n", + "info = info.rename(columns=col_mapping, copy=False)\n", + "info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "col_mapping = {'description' : 'nutrient',\n", + " 'group' : 'nutgroup'}\n", + "nutrients = nutrients.rename(columns=col_mapping, copy=False)\n", + "nutrients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ndata = pd.merge(nutrients, info, 
on='id', how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ndata" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ndata.ix[30000]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)\n", + "result['Zinc, Zn'].order().plot(kind='barh')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])\n", + "\n", + "get_maximum = lambda x: x.xs(x.value.idxmax())\n", + "get_minimum = lambda x: x.xs(x.value.idxmin())\n", + "\n", + "max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]\n", + "\n", + "# make the food a little smaller\n", + "max_foods.food = max_foods.food.str[:50]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "max_foods.ix['Amino Acids']['food']" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:9b3c55b1214330b9560e36ed5ace2e2bd26f9c642589d6c53379f139bc98862d" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data Wrangling: Clean, Transform, Merge, Reshape" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "np.random.seed(12345)\n", - "plt.rc('figure', figsize=(10, 6))\n", - "from pandas import Series, DataFrame\n", - "import pandas\n", - "import pandas as pd\n", - 
"np.set_printoptions(precision=4, threshold=500)\n", - "pd.options.display.max_rows = 100" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Combining and merging data sets" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Database-style DataFrame merges" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", - " 'data1': range(7)})\n", - "df2 = DataFrame({'key': ['a', 'b', 'd'],\n", - " 'data2': range(3)})\n", - "df1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, on='key')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", - " 'data1': range(7)})\n", - "df4 = DataFrame({'rkey': ['a', 'b', 'd'],\n", - " 'data2': range(3)})\n", - "pd.merge(df3, df4, left_on='lkey', right_on='rkey')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", - " 'data1': range(6)})\n", - "df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],\n", - " 'data2': range(5)})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, on='key', how='left')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, how='inner')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left = DataFrame({'key1': ['foo', 'foo', 'bar'],\n", - " 'key2': ['one', 'two', 'one'],\n", - " 'lval': [1, 2, 3]})\n", - "right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],\n", - " 'key2': ['one', 'one', 'one', 'two'],\n", - " 'rval': [4, 5, 6, 7]})\n", - "pd.merge(left, right, on=['key1', 'key2'], how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(left, right, on='key1')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": true, - "input": [ - "pd.merge(left, right, on='key1', suffixes=('_left', '_right'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Merging on index" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],\n", - " 'value': range(6)})\n", - 
"right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "right1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(left1, right1, left_on='key', right_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(left1, right1, left_on='key', right_index=True, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],\n", - " 'key2': [2000, 2001, 2002, 2001, 2002],\n", - " 'data': np.arange(5.)})\n", - "righth = DataFrame(np.arange(12).reshape((6, 2)),\n", - " index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],\n", - " [2001, 2000, 2000, 2000, 2001, 2002]],\n", - " columns=['event1', 'event2'])\n", - "lefth" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "righth" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(lefth, righth, left_on=['key1', 'key2'],\n", - " right_index=True, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]], 
index=['a', 'c', 'e'],\n", - " columns=['Ohio', 'Nevada'])\n", - "right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n", - " index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "right2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(left2, right2, how='outer', left_index=True, right_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2.join(right2, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left1.join(right1, on='key')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n", - " index=['a', 'c', 'e', 'f'], columns=['New York', 'Oregon'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2.join([right2, another])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2.join([right2, another], how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Concatenating along an axis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(12).reshape((3, 4))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.concatenate([arr, arr], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = Series([0, 1], index=['a', 'b'])\n", - "s2 = Series([2, 3, 4], index=['c', 'd', 'e'])\n", - "s3 = Series([5, 6], index=['f', 'g'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s2, s3])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s2, s3], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s4 = pd.concat([s1 * 5, s3])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s4], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s4], axis=1, join='inner')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Much more on 
the unstack function later\n", - "result.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],\n", - " columns=['one', 'two'])\n", - "df2 = DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],\n", - " columns=['three', 'four'])\n", - "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat({'level1': df1, 'level2': df2}, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],\n", - " names=['upper', 'lower'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])\n", - "df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([df1, df2], ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Combining data with overlap" - ] - }, - { - "cell_type": "code", 
- "collapsed": false, - "input": [ - "a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],\n", - " index=['f', 'e', 'd', 'c', 'b', 'a'])\n", - "b = Series(np.arange(len(a), dtype=np.float64),\n", - " index=['f', 'e', 'd', 'c', 'b', 'a'])\n", - "b[-1] = np.nan" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.where(pd.isnull(a), b, a)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b[:-2].combine_first(a[2:])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = DataFrame({'a': [1., np.nan, 5., np.nan],\n", - " 'b': [np.nan, 2., np.nan, 6.],\n", - " 'c': range(2, 18, 4)})\n", - "df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],\n", - " 'b': [np.nan, 3., 4., 6., 8.]})\n", - "df1.combine_first(df2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Reshaping and pivoting" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reshaping with hierarchical indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame(np.arange(6).reshape((2, 3)),\n", - " index=pd.Index(['Ohio', 'Colorado'], name='state'),\n", - " columns=pd.Index(['one', 'two', 'three'], name='number'))\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = data.stack()\n", - "result" - ], - "language": "python", - 
"metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result.unstack(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result.unstack('state')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])\n", - "s2 = Series([4, 5, 6], index=['c', 'd', 'e'])\n", - "data2 = pd.concat([s1, s2], keys=['one', 'two'])\n", - "data2.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data2.unstack().stack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data2.unstack().stack(dropna=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame({'left': result, 'right': result + 5},\n", - " columns=pd.Index(['left', 'right'], name='side'))\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.unstack('state')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.unstack('state').stack('side')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Pivoting \"long\" to \"wide\" format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('ch07/macrodata.csv')\n", - "periods = 
pd.PeriodIndex(year=data.year, quarter=data.quarter, name='date')\n", - "data = DataFrame(data.to_records(),\n", - " columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'),\n", - " index=periods.to_timestamp('D', 'end'))\n", - "\n", - "ldata = data.stack().reset_index().rename(columns={0: 'value'})\n", - "wdata = ldata.pivot('date', 'item', 'value')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ldata[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pivoted = ldata.pivot('date', 'item', 'value')\n", - "pivoted.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ldata['value2'] = np.random.randn(len(ldata))\n", - "ldata[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pivoted = ldata.pivot('date', 'item')\n", - "pivoted[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pivoted['value'][:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "unstacked = ldata.set_index(['date', 'item']).unstack('item')\n", - "unstacked[:7]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data transformation" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Removing duplicates" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,\n", - " 'k2': [1, 1, 2, 3, 3, 4, 4]})\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "data.duplicated()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop_duplicates()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['v1'] = range(7)\n", - "data.drop_duplicates(['k1'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop_duplicates(['k1', 'k2'], take_last=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Transforming data using a function or mapping" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami',\n", - " 'corned beef', 'Bacon', 'pastrami', 'honey ham',\n", - " 'nova lox'],\n", - " 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "meat_to_animal = {\n", - " 'bacon': 'pig',\n", - " 'pulled pork': 'pig',\n", - " 'pastrami': 'cow',\n", - " 'corned beef': 'cow',\n", - " 'honey ham': 'pig',\n", - " 'nova lox': 'salmon'\n", - "}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['animal'] = data['food'].map(str.lower).map(meat_to_animal)\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['food'].map(lambda x: meat_to_animal[x.lower()])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Replacing values" - ] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "data = Series([1., -999., 2., -999., -1000., 3.])\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace(-999, np.nan)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace([-999, -1000], np.nan)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace([-999, -1000], [np.nan, 0])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace({-999: np.nan, -1000: 0})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Renaming axis indexes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = DataFrame(np.arange(12).reshape((3, 4)),\n", - " index=['Ohio', 'Colorado', 'New York'],\n", - " columns=['one', 'two', 'three', 'four'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.index.map(str.upper)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.index = data.index.map(str.upper)\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.rename(index=str.title, columns=str.upper)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.rename(index={'OHIO': 'INDIANA'},\n", - " columns={'three': 'peekaboo'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - 
}, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Always returns a reference to a DataFrame\n", - "_ = data.rename(index={'OHIO': 'INDIANA'}, inplace=True)\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Discretization and binning" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = [18, 25, 35, 60, 100]\n", - "cats = pd.cut(ages, bins)\n", - "cats" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cats.labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cats.levels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.value_counts(cats)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.cut(ages, [18, 26, 36, 61, 100], right=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']\n", - "pd.cut(ages, bins, labels=group_names)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = np.random.rand(20)\n", - "pd.cut(data, 4, precision=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = np.random.randn(1000) # Normally distributed\n", - "cats = pd.qcut(data, 4) # Cut into 
quartiles\n", - "cats" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.value_counts(cats)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Detecting and filtering outliers" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)\n", - "data = DataFrame(np.random.randn(1000, 4))\n", - "data.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "col = data[3]\n", - "col[np.abs(col) > 3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[(np.abs(data) > 3).any(1)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[np.abs(data) > 3] = np.sign(data) * 3\n", - "data.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Permutation and random sampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame(np.arange(5 * 4).reshape((5, 4)))\n", - "sampler = np.random.permutation(5)\n", - "sampler" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.take(sampler)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "df.take(np.random.permutation(len(df))[:3])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bag = np.array([5, 7, -1, 6, 4])\n", - "sampler = np.random.randint(0, len(bag), size=10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sampler" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "draws = bag.take(sampler)\n", - "draws" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Computing indicator / dummy variables" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", - " 'data1': range(6)})\n", - "pd.get_dummies(df['key'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dummies = pd.get_dummies(df['key'], prefix='key')\n", - "df_with_dummy = df[['data1']].join(dummies)\n", - "df_with_dummy" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mnames = ['movie_id', 'title', 'genres']\n", - "movies = pd.read_table('ch07/movies.dat', sep='::', header=None,\n", - " names=mnames)\n", - "movies[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "genre_iter = (set(x.split('|')) for x in movies.genres)\n", - "genres = sorted(set.union(*genre_iter))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for i, gen in enumerate(movies.genres):\n", - " dummies.ix[i, gen.split('|')] = 1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "movies_windic = movies.join(dummies.add_prefix('Genre_'))\n", - "movies_windic.ix[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = np.random.rand(10)\n", - "values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n", - "pd.get_dummies(pd.cut(values, bins))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "String manipulation" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "String object methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val = 'a,b, guido'\n", - "val.split(',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pieces = [x.strip() for x in val.split(',')]\n", - "pieces" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "first, second, third = pieces\n", - "first + '::' + second + '::' + third" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'::'.join(pieces)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "'guido' in val" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.index(',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.find(':')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.index(':')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.count(',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.replace(',', '::')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.replace(',', '')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Regular expressions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import re\n", - "text = \"foo bar\\t baz \\tqux\"\n", - "re.split('\\s+', text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex = re.compile('\\s+')\n", - "regex.split(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex.findall(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "text = \"\"\"Dave dave@google.com\n", - "Steve steve@gmail.com\n", - "Rob rob@gmail.com\n", - "Ryan ryan@yahoo.com\n", - "\"\"\"\n", - "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'\n", - "\n", - "# re.IGNORECASE 
makes the regex case-insensitive\n", - "regex = re.compile(pattern, flags=re.IGNORECASE)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex.findall(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m = regex.search(text)\n", - "m" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "text[m.start():m.end()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(regex.match(text))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(regex.sub('REDACTED', text))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'\n", - "regex = re.compile(pattern, flags=re.IGNORECASE)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m = regex.match('wesm@bright.net')\n", - "m.groups()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex.findall(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(regex.sub(r'Username: \\1, Domain: \\2, Suffix: \\3', text))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex = re.compile(r\"\"\"\n", - " (?P[A-Z0-9._%+-]+)\n", - " @\n", - " (?P[A-Z0-9.-]+)\n", - " \\.\n", - " (?P[A-Z]{2,4})\"\"\", flags=re.IGNORECASE|re.VERBOSE)" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m = regex.match('wesm@bright.net')\n", - "m.groupdict()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Vectorized string functions in pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',\n", - " 'Rob': 'rob@gmail.com', 'Wes': np.nan}\n", - "data = Series(data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.str.contains('gmail')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pattern" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.str.findall(pattern, flags=re.IGNORECASE)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "matches = data.str.match(pattern, flags=re.IGNORECASE)\n", - "matches" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "matches.str.get(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "matches.str[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"data.str[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Example: USDA Food Database" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "{\n", - " \"id\": 21441,\n", - " \"description\": \"KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,\n", - "Wing, meat and skin with breading\",\n", - " \"tags\": [\"KFC\"],\n", - " \"manufacturer\": \"Kentucky Fried Chicken\",\n", - " \"group\": \"Fast Foods\",\n", - " \"portions\": [\n", - " {\n", - " \"amount\": 1,\n", - " \"unit\": \"wing, with skin\",\n", - " \"grams\": 68.0\n", - " },\n", - "\n", - " ...\n", - " ],\n", - " \"nutrients\": [\n", - " {\n", - " \"value\": 20.8,\n", - " \"units\": \"g\",\n", - " \"description\": \"Protein\",\n", - " \"group\": \"Composition\"\n", - " },\n", - "\n", - " ...\n", - " ]\n", - "}" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import json\n", - "db = json.load(open('ch07/foods-2011-10-03.json'))\n", - "len(db)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "db[0].keys()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "db[0]['nutrients'][0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients = DataFrame(db[0]['nutrients'])\n", - "nutrients[:7]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "info_keys = ['description', 'group', 'id', 'manufacturer']\n", - "info = DataFrame(db, columns=info_keys)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "info[:5]" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "info" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.value_counts(info.group)[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients = []\n", - "\n", - "for rec in db:\n", - " fnuts = DataFrame(rec['nutrients'])\n", - " fnuts['id'] = rec['id']\n", - " nutrients.append(fnuts)\n", - "\n", - "nutrients = pd.concat(nutrients, ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients.duplicated().sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients = nutrients.drop_duplicates()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "col_mapping = {'description' : 'food',\n", - " 'group' : 'fgroup'}\n", - "info = info.rename(columns=col_mapping, copy=False)\n", - "info" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "col_mapping = {'description' : 'nutrient',\n", - " 'group' : 'nutgroup'}\n", - "nutrients = nutrients.rename(columns=col_mapping, copy=False)\n", - "nutrients" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ndata = pd.merge(nutrients, info, on='id', how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ndata" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ndata.ix[30000]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)\n", - "result['Zinc, Zn'].order().plot(kind='barh')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])\n", - "\n", - "get_maximum = lambda x: x.xs(x.value.idxmax())\n", - "get_minimum = lambda x: x.xs(x.value.idxmin())\n", - "\n", - "max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]\n", - "\n", - "# make the food a little smaller\n", - "max_foods.food = max_foods.food.str[:50]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "max_foods.ix['Amino Acids']['food']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c17a129ffe73594b5d6b291a36d15584a422d1de Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 28 Oct 2015 19:07:23 -0700 Subject: [PATCH 5/9] Run through ch08 --- ch08.ipynb | 2000 +++++++++++++++++++++++++++------------------------- 1 file changed, 1023 insertions(+), 977 deletions(-) diff --git a/ch08.ipynb b/ch08.ipynb index 994cda63a..9f430e889 100644 --- a/ch08.ipynb +++ b/ch08.ipynb @@ -1,980 +1,1026 @@ { + "cells": 
[ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Plotting and Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "np.random.seed(12345)\n", + "plt.rc('figure', figsize=(10, 6))\n", + "from pandas import Series, DataFrame\n", + "import pandas as pd\n", + "np.set_printoptions(precision=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## A brief matplotlib API primer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Figures and Subplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig = plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ax1 = fig.add_subplot(2, 2, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ax2 = fig.add_subplot(2, 2, 2)\n", + "ax3 = fig.add_subplot(2, 2, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from numpy.random import randn\n", + 
"plt.plot(randn(50).cumsum(), 'k--')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "_ = ax1.hist(randn(100), bins=20, color='k', alpha=0.3)\n", + "ax2.scatter(np.arange(30), np.arange(30) + 3 * randn(30))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 3)\n", + "axes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Adjusting the spacing around subplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.subplots_adjust(left=None, bottom=None, right=None, top=None,\n", + " wspace=None, hspace=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", + "for i in range(2):\n", + " for j in range(2):\n", + " axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)\n", + "plt.subplots_adjust(wspace=0, hspace=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", + "for i in range(2):\n", + " for j in range(2):\n", + " axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)\n", + "plt.subplots_adjust(wspace=0, hspace=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Colors, markers, and line styles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.plot(randn(30).cumsum(), 'ko--')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = randn(30).cumsum()\n", + "plt.plot(data, 'k--', label='Default')\n", + "plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')\n", + "plt.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ticks, labels, and legends" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Setting the title, axis labels, ticks, and ticklabels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", + "ax.plot(randn(1000).cumsum())\n", + "\n", + "ticks = ax.set_xticks([0, 250, 500, 750, 1000])\n", + "labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],\n", + " rotation=30, fontsize='small')\n", + "ax.set_title('My first matplotlib plot')\n", + "ax.set_xlabel('Stages')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Adding legends" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", + "ax.plot(randn(1000).cumsum(), 'k', label='one')\n", + "ax.plot(randn(1000).cumsum(), 'k--', label='two')\n", + "ax.plot(randn(1000).cumsum(), 'k.', label='three')\n", + "\n", + "ax.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Annotations and drawing on a subplot" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "\n", + "data = pd.read_csv('ch08/spx.csv', index_col=0, parse_dates=True)\n", + "spx = data['SPX']\n", + "\n", + "spx.plot(ax=ax, style='k-')\n", + "\n", + "crisis_data = [\n", + " (datetime(2007, 10, 11), 'Peak of bull market'),\n", + " (datetime(2008, 3, 12), 'Bear Stearns Fails'),\n", + " (datetime(2008, 9, 15), 'Lehman Bankruptcy')\n", + "]\n", + "\n", + "for date, label in crisis_data:\n", + " ax.annotate(label, xy=(date, spx.asof(date) + 50),\n", + " xytext=(date, spx.asof(date) + 200),\n", + " arrowprops=dict(facecolor='black'),\n", + " horizontalalignment='left', verticalalignment='top')\n", + "\n", + "# Zoom in on 2007-2010\n", + "ax.set_xlim(['1/1/2007', '1/1/2011'])\n", + "ax.set_ylim([600, 1800])\n", + "\n", + "ax.set_title('Important dates in 2008-2009 financial crisis')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig = plt.figure()\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "\n", + "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)\n", + "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)\n", + "pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n", + " color='g', alpha=0.5)\n", + "\n", + "ax.add_patch(rect)\n", + "ax.add_patch(circ)\n", + "ax.add_patch(pgon)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving plots to file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig.savefig('figpath.svg')" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig.savefig('figpath.png', dpi=400, bbox_inches='tight')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from io import BytesIO\n", + "buffer = BytesIO()\n", + "plt.savefig(buffer)\n", + "plot_data = buffer.getvalue()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### matplotlib configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.rc('figure', figsize=(10, 10))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting functions in pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Line plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s = Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))\n", + "s.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame(np.random.randn(10, 4).cumsum(0),\n", + " columns=['A', 'B', 'C', 'D'],\n", + " index=np.arange(0, 100, 10))\n", + "df.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bar plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 1)\n", + "data = Series(np.random.rand(16), index=list('abcdefghijklmnop'))\n", + "data.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)\n", + "data.plot(kind='barh', 
ax=axes[1], color='k', alpha=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame(np.random.rand(6, 4),\n", + " index=['one', 'two', 'three', 'four', 'five', 'six'],\n", + " columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))\n", + "df\n", + "df.plot(kind='bar')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.plot(kind='barh', stacked=True, alpha=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips = pd.read_csv('ch08/tips.csv')\n", + "party_counts = pd.crosstab(tips.day, tips.size)\n", + "party_counts\n", + "# Not many 1- and 6-person parties\n", + "party_counts = party_counts.ix[:, 2:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Normalize to sum to 1\n", + "party_pcts = party_counts.div(party_counts.sum(1).astype(float), axis=0)\n", + "party_pcts\n", + "\n", + "party_pcts.plot(kind='bar', stacked=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Histograms and density plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips['tip_pct'] = tips['tip'] / tips['total_bill']\n", + "tips['tip_pct'].hist(bins=50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips['tip_pct'].plot(kind='kde')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "comp1 = np.random.normal(0, 1, size=200) # N(0, 1)\n", + "comp2 = np.random.normal(10, 2, size=200) # N(10, 4)\n", + "values = Series(np.concatenate([comp1, comp2]))\n", + "values.hist(bins=100, alpha=0.3, color='k', normed=True)\n", + "values.plot(kind='kde', style='k--')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scatter plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "macro = pd.read_csv('ch08/macrodata.csv')\n", + "data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]\n", + "trans_data = np.log(data).diff().dropna()\n", + "trans_data[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.scatter(trans_data['m1'], trans_data['unemp'])\n", + "plt.title('Changes in log %s vs. 
log %s' % ('m1', 'unemp'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Plotting Maps: Visualizing Haiti Earthquake Crisis data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.read_csv('ch08/Haiti.csv')\n", + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data['CATEGORY'][:6]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &\n", + " (data.LONGITUDE > -75) & (data.LONGITUDE < -70)\n", + " & data.CATEGORY.notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def to_cat_list(catstr):\n", + " stripped = (x.strip() for x in catstr.split(','))\n", + " return [x for x in stripped if x]\n", + "\n", + "def get_all_categories(cat_series):\n", + " cat_sets = (set(to_cat_list(x)) for x in cat_series)\n", + " return sorted(set.union(*cat_sets))\n", + "\n", + "def get_english(cat):\n", + " code, names = cat.split('.')\n", + " if '|' in names:\n", + " names = names.split(' | ')[1]\n", + " return code, names.strip()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "get_english('2. Urgences logistiques | Vital Lines')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "all_cats = get_all_categories(data.CATEGORY)\n", + "# Generator expression\n", + "english_mapping = dict(get_english(x) for x in all_cats)\n", + "english_mapping['2a']\n", + "english_mapping['6c']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_code(seq):\n", + " return [x.split('.')[0] for x in seq if x]\n", + "\n", + "all_codes = get_code(all_cats)\n", + "code_index = pd.Index(np.unique(all_codes))\n", + "dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),\n", + " index=data.index, columns=code_index)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dummy_frame.ix[:, :6].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for row, cat in zip(data.index, data.CATEGORY):\n", + " codes = get_code(to_cat_list(cat))\n", + " dummy_frame.ix[row, codes] = 1\n", + "\n", + "data = data.join(dummy_frame.add_prefix('category_'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.ix[:, 10:15].info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from mpl_toolkits.basemap import Basemap\n", + "import matplotlib.pyplot as plt\n", + "\n", + "def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,\n", + " lllon=-75, urlon=-71):\n", + " # create polar stereographic Basemap instance.\n", + " m = Basemap(ax=ax, 
projection='stere',\n", + " lon_0=(urlon + lllon) / 2,\n", + " lat_0=(urlat + lllat) / 2,\n", + " llcrnrlat=lllat, urcrnrlat=urlat,\n", + " llcrnrlon=lllon, urcrnrlon=urlon,\n", + " resolution='f')\n", + " # draw coastlines, state and country boundaries, edge of map.\n", + " m.drawcoastlines()\n", + " m.drawstates()\n", + " m.drawcountries()\n", + " return m" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", + "fig.subplots_adjust(hspace=0.05, wspace=0.05)\n", + "\n", + "to_plot = ['2a', '1', '3c', '7a']\n", + "\n", + "lllat=17.25; urlat=20.25; lllon=-75; urlon=-71\n", + "\n", + "for code, ax in zip(to_plot, axes.flat):\n", + " m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,\n", + " lllon=lllon, urlon=urlon)\n", + "\n", + " cat_data = data[data['category_%s' % code] == 1]\n", + "\n", + " # compute map proj coordinates.\n", + " x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)\n", + "\n", + " m.plot(x, y, 'k.', alpha=0.5)\n", + " ax.set_title('%s: %s' % (code, english_mapping[code]))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", + "fig.subplots_adjust(hspace=0.05, wspace=0.05)\n", + "\n", + "to_plot = ['2a', '1', '3c', '7a']\n", + "\n", + "lllat=17.25; urlat=20.25; lllon=-75; urlon=-71\n", + "\n", + "def make_plot():\n", + "\n", + " for i, code in enumerate(to_plot):\n", + " cat_data = data[data['category_%s' % code] == 1]\n", + " lons, lats = cat_data.LONGITUDE, cat_data.LATITUDE\n", + "\n", + " ax = axes.flat[i]\n", + " m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,\n", + " lllon=lllon, urlon=urlon)\n", + "\n", + " # compute map proj coordinates.\n", + " x, y = m(lons.values, lats.values)\n", + "\n", + " m.plot(x, y, 'k.', alpha=0.5)\n", + 
" ax.set_title('%s: %s' % (code, english_mapping[code]))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "make_plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "shapefile_path = 'ch08/PortAuPrince_Roads/PortAuPrince_Roads'\n", + "m.readshapefile(shapefile_path, 'roads')" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:91cb49615ff73ea799022440811545d971f344925e863113d3a6e221eeb2e798" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Plotting and Visualization" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "np.random.seed(12345)\n", - "plt.rc('figure', figsize=(10, 6))\n", - "from pandas import Series, DataFrame\n", - "import pandas as pd\n", - "np.set_printoptions(precision=4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pwd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts/" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "A brief matplotlib API primer" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Figures and Subplots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax1 = fig.add_subplot(2, 2, 1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax2 = fig.add_subplot(2, 2, 2)\n", - "ax3 = fig.add_subplot(2, 2, 3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.random import randn\n", - "plt.plot(randn(50).cumsum(), 'k--')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "_ = ax1.hist(randn(100), bins=20, color='k', alpha=0.3)\n", - "ax2.scatter(np.arange(30), np.arange(30) + 3 * randn(30))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(2, 3)\n", - "axes" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Adjusting the spacing around subplots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.subplots_adjust(left=None, bottom=None, right=None, top=None,\n", - " wspace=None, hspace=None)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", - "for i in range(2):\n", - " for j in range(2):\n", - " axes[i, j].hist(randn(500), bins=50, 
color='k', alpha=0.5)\n", - "plt.subplots_adjust(wspace=0, hspace=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", - "for i in range(2):\n", - " for j in range(2):\n", - " axes[i, j].hist(randn(500), bins=50, color='k', alpha=0.5)\n", - "plt.subplots_adjust(wspace=0, hspace=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Colors, markers, and line styles" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.plot(randn(30).cumsum(), 'ko--')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = randn(30).cumsum()\n", - "plt.plot(data, 'k--', label='Default')\n", - "plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')\n", - "plt.legend(loc='best')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Ticks, labels, and legends" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Setting the title, axis labels, ticks, and ticklabels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", - "ax.plot(randn(1000).cumsum())\n", - "\n", - "ticks = ax.set_xticks([0, 250, 500, 750, 1000])\n", - "labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],\n", - " rotation=30, fontsize='small')\n", - 
"ax.set_title('My first matplotlib plot')\n", - "ax.set_xlabel('Stages')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Adding legends" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", - "ax.plot(randn(1000).cumsum(), 'k', label='one')\n", - "ax.plot(randn(1000).cumsum(), 'k--', label='two')\n", - "ax.plot(randn(1000).cumsum(), 'k.', label='three')\n", - "\n", - "ax.legend(loc='best')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Annotations and drawing on a subplot" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime\n", - "\n", - "fig = plt.figure()\n", - "ax = fig.add_subplot(1, 1, 1)\n", - "\n", - "data = pd.read_csv('ch08/spx.csv', index_col=0, parse_dates=True)\n", - "spx = data['SPX']\n", - "\n", - "spx.plot(ax=ax, style='k-')\n", - "\n", - "crisis_data = [\n", - " (datetime(2007, 10, 11), 'Peak of bull market'),\n", - " (datetime(2008, 3, 12), 'Bear Stearns Fails'),\n", - " (datetime(2008, 9, 15), 'Lehman Bankruptcy')\n", - "]\n", - "\n", - "for date, label in crisis_data:\n", - " ax.annotate(label, xy=(date, spx.asof(date) + 50),\n", - " xytext=(date, spx.asof(date) + 200),\n", - " arrowprops=dict(facecolor='black'),\n", - " horizontalalignment='left', verticalalignment='top')\n", - "\n", - "# Zoom in on 2007-2010\n", - "ax.set_xlim(['1/1/2007', '1/1/2011'])\n", - "ax.set_ylim([600, 1800])\n", - "\n", - "ax.set_title('Important dates in 2008-2009 financial crisis')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()\n", - "ax = fig.add_subplot(1, 1, 1)\n", - "\n", - "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', 
alpha=0.3)\n", - "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)\n", - "pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n", - " color='g', alpha=0.5)\n", - "\n", - "ax.add_patch(rect)\n", - "ax.add_patch(circ)\n", - "ax.add_patch(pgon)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Saving plots to file" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig.savefig('figpath.svg')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig.savefig('figpath.png', dpi=400, bbox_inches='tight')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from io import BytesIO\n", - "buffer = BytesIO()\n", - "plt.savefig(buffer)\n", - "plot_data = buffer.getvalue()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "matplotlib configuration" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.rc('figure', figsize=(10, 10))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Plotting functions in pandas" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Line plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))\n", - "s.plot()" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame(np.random.randn(10, 4).cumsum(0),\n", - " columns=['A', 'B', 'C', 'D'],\n", - " index=np.arange(0, 100, 10))\n", - "df.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Bar plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(2, 1)\n", - "data = Series(np.random.rand(16), index=list('abcdefghijklmnop'))\n", - "data.plot(kind='bar', ax=axes[0], color='k', alpha=0.7)\n", - "data.plot(kind='barh', ax=axes[1], color='k', alpha=0.7)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame(np.random.rand(6, 4),\n", - " index=['one', 'two', 'three', 'four', 'five', 'six'],\n", - " columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))\n", - "df\n", - "df.plot(kind='bar')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.plot(kind='barh', stacked=True, alpha=0.5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips = pd.read_csv('ch08/tips.csv')\n", - "party_counts = pd.crosstab(tips.day, tips.size)\n", - "party_counts\n", - "# Not many 1- and 6-person parties\n", - "party_counts = party_counts.ix[:, 2:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Normalize to sum to 1\n", - "party_pcts = party_counts.div(party_counts.sum(1).astype(float), axis=0)\n", - "party_pcts\n", - "\n", - 
"party_pcts.plot(kind='bar', stacked=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Histograms and density plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips['tip_pct'] = tips['tip'] / tips['total_bill']\n", - "tips['tip_pct'].hist(bins=50)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips['tip_pct'].plot(kind='kde')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "comp1 = np.random.normal(0, 1, size=200) # N(0, 1)\n", - "comp2 = np.random.normal(10, 2, size=200) # N(10, 4)\n", - "values = Series(np.concatenate([comp1, comp2]))\n", - "values.hist(bins=100, alpha=0.3, color='k', normed=True)\n", - "values.plot(kind='kde', style='k--')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Scatter plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "macro = pd.read_csv('ch08/macrodata.csv')\n", - "data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]\n", - "trans_data = np.log(data).diff().dropna()\n", - "trans_data[-5:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": 
{}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.scatter(trans_data['m1'], trans_data['unemp'])\n", - "plt.title('Changes in log %s vs. log %s' % ('m1', 'unemp'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.scatter_matrix(trans_data, diagonal='kde', color='k', alpha=0.3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Plotting Maps: Visualizing Haiti Earthquake Crisis data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('ch08/Haiti.csv')\n", - "data.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['CATEGORY'][:6]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &\n", - " (data.LONGITUDE > -75) & (data.LONGITUDE < -70)\n", - " & data.CATEGORY.notnull()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def to_cat_list(catstr):\n", - " stripped = (x.strip() for x in catstr.split(','))\n", - " return [x for x in stripped if x]\n", - "\n", - "def get_all_categories(cat_series):\n", - " cat_sets = (set(to_cat_list(x)) for x in cat_series)\n", - " return sorted(set.union(*cat_sets))\n", - "\n", - "def get_english(cat):\n", - " code, names = 
cat.split('.')\n", - " if '|' in names:\n", - " names = names.split(' | ')[1]\n", - " return code, names.strip()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "get_english('2. Urgences logistiques | Vital Lines')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "all_cats = get_all_categories(data.CATEGORY)\n", - "# Generator expression\n", - "english_mapping = dict(get_english(x) for x in all_cats)\n", - "english_mapping['2a']\n", - "english_mapping['6c']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_code(seq):\n", - " return [x.split('.')[0] for x in seq if x]\n", - "\n", - "all_codes = get_code(all_cats)\n", - "code_index = pd.Index(np.unique(all_codes))\n", - "dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),\n", - " index=data.index, columns=code_index)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dummy_frame.ix[:, :6].info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for row, cat in zip(data.index, data.CATEGORY):\n", - " codes = get_code(to_cat_list(cat))\n", - " dummy_frame.ix[row, codes] = 1\n", - "\n", - "data = data.join(dummy_frame.add_prefix('category_'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.ix[:, 10:15].info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from mpl_toolkits.basemap import Basemap\n", - "import matplotlib.pyplot as plt\n", - "\n", - "def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,\n", - " 
lllon=-75, urlon=-71):\n", - " # create polar stereographic Basemap instance.\n", - " m = Basemap(ax=ax, projection='stere',\n", - " lon_0=(urlon + lllon) / 2,\n", - " lat_0=(urlat + lllat) / 2,\n", - " llcrnrlat=lllat, urcrnrlat=urlat,\n", - " llcrnrlon=lllon, urcrnrlon=urlon,\n", - " resolution='f')\n", - " # draw coastlines, state and country boundaries, edge of map.\n", - " m.drawcoastlines()\n", - " m.drawstates()\n", - " m.drawcountries()\n", - " return m" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", - "fig.subplots_adjust(hspace=0.05, wspace=0.05)\n", - "\n", - "to_plot = ['2a', '1', '3c', '7a']\n", - "\n", - "lllat=17.25; urlat=20.25; lllon=-75; urlon=-71\n", - "\n", - "for code, ax in zip(to_plot, axes.flat):\n", - " m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,\n", - " lllon=lllon, urlon=urlon)\n", - "\n", - " cat_data = data[data['category_%s' % code] == 1]\n", - "\n", - " # compute map proj coordinates.\n", - " x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)\n", - "\n", - " m.plot(x, y, 'k.', alpha=0.5)\n", - " ax.set_title('%s: %s' % (code, english_mapping[code]))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))\n", - "fig.subplots_adjust(hspace=0.05, wspace=0.05)\n", - "\n", - "to_plot = ['2a', '1', '3c', '7a']\n", - "\n", - "lllat=17.25; urlat=20.25; lllon=-75; urlon=-71\n", - "\n", - "def make_plot():\n", - "\n", - " for i, code in enumerate(to_plot):\n", - " cat_data = data[data['category_%s' % code] == 1]\n", - " lons, lats = cat_data.LONGITUDE, cat_data.LATITUDE\n", - "\n", - " ax = axes.flat[i]\n", - " m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,\n", - " lllon=lllon, urlon=urlon)\n", - "\n", - " # compute map proj 
coordinates.\n", - " x, y = m(lons.values, lats.values)\n", - "\n", - " m.plot(x, y, 'k.', alpha=0.5)\n", - " ax.set_title('%s: %s' % (code, english_mapping[code]))\n", - " " - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "make_plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "shapefile_path = 'ch08/PortAuPrince_Roads/PortAuPrince_Roads'\n", - "m.readshapefile(shapefile_path, 'roads')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From c8a8f40b7a13a4175d905401ad1cc40300987eec Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 6 Nov 2015 14:16:15 -0800 Subject: [PATCH 6/9] Run through c09 --- ch09.ipynb | 3255 +++++++++++++++++++++++++++------------------------- 1 file changed, 1674 insertions(+), 1581 deletions(-) diff --git a/ch09.ipynb b/ch09.ipynb index a17490851..1cd1a8756 100644 --- a/ch09.ipynb +++ b/ch09.ipynb @@ -1,1584 +1,1677 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Aggregation and Group Operations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "np.random.seed(12345)\n", + "plt.rc('figure', figsize=(10, 6))\n", + 
"from pandas import Series, DataFrame\n", + "import pandas as pd\n", + "np.set_printoptions(precision=4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.options.display.notebook_repr_html = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GroupBy mechanics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],\n", + " 'key2' : ['one', 'two', 'one', 'two', 'one'],\n", + " 'data1' : np.random.randn(5),\n", + " 'data2' : np.random.randn(5)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = df['data1'].groupby(df['key1'])\n", + "grouped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "means = df['data1'].groupby([df['key1'], df['key2']]).mean()\n", + "means" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "means.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])\n", + "years = np.array([2005, 2005, 2006, 2005, 2006])\n", + "df['data1'].groupby([states, years]).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{ + "collapsed": false + }, + "outputs": [], + "source": [ + "df.groupby('key1').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.groupby(['key1', 'key2']).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.groupby(['key1', 'key2']).size()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Iterating over groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for name, group in df.groupby('key1'):\n", + " print(name)\n", + " print(group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for (k1, k2), group in df.groupby(['key1', 'key2']):\n", + " print((k1, k2))\n", + " print(group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pieces = dict(list(df.groupby('key1')))\n", + "pieces['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = df.groupby(df.dtypes, axis=1)\n", + "dict(list(grouped))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Selecting a column or subset of columns" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "df.groupby('key1')['data1']\n", + "df.groupby('key1')[['data2']]" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "df['data1'].groupby(df['key1'])\n", + "df[['data2']].groupby(df['key1'])" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df.groupby(['key1', 'key2'])[['data2']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s_grouped = df.groupby(['key1', 'key2'])['data2']\n", + "s_grouped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s_grouped.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grouping with dicts and Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "people = DataFrame(np.random.randn(5, 5),\n", + " columns=['a', 'b', 'c', 'd', 'e'],\n", + " index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])\n", + "people.ix[2:3, ['b', 'c']] = np.nan # Add a few NA values\n", + "people" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mapping = {'a': 'red', 'b': 'red', 'c': 'blue',\n", + " 'd': 'blue', 'e': 'red', 'f' : 'orange'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_column = people.groupby(mapping, axis=1)\n", + "by_column.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "map_series = Series(mapping)\n", + "map_series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "people.groupby(map_series, axis=1).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grouping with functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false + }, + "outputs": [], + "source": [ + "people.groupby(len).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "key_list = ['one', 'one', 'one', 'two', 'two']\n", + "people.groupby([len, key_list]).min()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Grouping by index levels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],\n", + " [1, 3, 5, 1, 3]], names=['cty', 'tenor'])\n", + "hier_df = DataFrame(np.random.randn(4, 5), columns=columns)\n", + "hier_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "hier_df.groupby(level='cty', axis=1).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data aggregation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = df.groupby('key1')\n", + "grouped['data1'].quantile(0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def peak_to_peak(arr):\n", + " return arr.max() - arr.min()\n", + "grouped.agg(peak_to_peak)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips = pd.read_csv('ch08/tips.csv')\n", + "# Add tip percentage of total bill\n", + 
"tips['tip_pct'] = tips['tip'] / tips['total_bill']\n", + "tips[:6]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column-wise and multiple function application" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = tips.groupby(['sex', 'smoker'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped_pct = grouped['tip_pct']\n", + "grouped_pct.agg('mean')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped_pct.agg(['mean', 'std', peak_to_peak])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "functions = ['count', 'mean', 'max']\n", + "result = grouped['tip_pct', 'total_bill'].agg(functions)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result['tip_pct']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]\n", + "grouped['tip_pct', 'total_bill'].agg(ftuples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped.agg({'tip' : np.max, 'size' : 'sum'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped.agg({'tip_pct' : ['min', 'max', 'mean', 
'std'],\n", + " 'size' : 'sum'})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Returning aggregated data in \"unindexed\" form" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.groupby(['sex', 'smoker'], as_index=False).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Group-wise operations and transformations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "k1_means = df.groupby('key1').mean().add_prefix('mean_')\n", + "k1_means" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.merge(df, k1_means, left_on='key1', right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "key = ['one', 'two', 'one', 'two', 'one']\n", + "people.groupby(key).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "people.groupby(key).transform(np.mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def demean(arr):\n", + " return arr - arr.mean()\n", + "demeaned = people.groupby(key).transform(demean)\n", + "demeaned" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "demeaned.groupby(key).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Apply: General split-apply-combine" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def top(df, n=5, column='tip_pct'):\n", + " return df.sort_index(by=column)[-n:]\n", + "top(tips, n=6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.groupby('smoker').apply(top)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = tips.groupby('smoker')['tip_pct'].describe()\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result.unstack('smoker')" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "f = lambda x: x.describe()\n", + "grouped.apply(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Suppressing the group keys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.groupby('smoker', group_keys=False).apply(top)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantile and bucket analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame({'data1': np.random.randn(1000),\n", + " 'data2': np.random.randn(1000)})\n", + "factor = pd.cut(frame.data1, 4)\n", + "factor[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_stats(group):\n", + " return {'min': group.min(), 'max': 
group.max(),\n", + " 'count': group.count(), 'mean': group.mean()}\n", + "\n", + "grouped = frame.data2.groupby(factor)\n", + "grouped.apply(get_stats).unstack()\n", + "\n", + "#ADAPT the output is not sorted in the book while this is the case now (swap first two lines)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Return quantile numbers\n", + "grouping = pd.qcut(frame.data1, 10, labels=False)\n", + "\n", + "grouped = frame.data2.groupby(grouping)\n", + "grouped.apply(get_stats).unstack()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example: Filling missing values with group-specific values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s = Series(np.random.randn(6))\n", + "s[::2] = np.nan\n", + "s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s.fillna(s.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "states = ['Ohio', 'New York', 'Vermont', 'Florida',\n", + " 'Oregon', 'Nevada', 'California', 'Idaho']\n", + "group_key = ['East'] * 4 + ['West'] * 4\n", + "data = Series(np.random.randn(8), index=states)\n", + "data[['Vermont', 'Nevada', 'Idaho']] = np.nan\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.groupby(group_key).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fill_mean = lambda g: g.fillna(g.mean())\n", + "data.groupby(group_key).apply(fill_mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false + }, + "outputs": [], + "source": [ + "fill_values = {'East': 0.5, 'West': -1}\n", + "fill_func = lambda g: g.fillna(fill_values[g.name])\n", + "\n", + "data.groupby(group_key).apply(fill_func)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example: Random sampling and permutation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Hearts, Spades, Clubs, Diamonds\n", + "suits = ['H', 'S', 'C', 'D']\n", + "card_val = (range(1, 11) + [10] * 3) * 4\n", + "base_names = ['A'] + range(2, 11) + ['J', 'K', 'Q']\n", + "cards = []\n", + "for suit in ['H', 'S', 'C', 'D']:\n", + " cards.extend(str(num) + suit for num in base_names)\n", + "\n", + "deck = Series(card_val, index=cards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "deck[:13]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def draw(deck, n=5):\n", + " return deck.take(np.random.permutation(len(deck))[:n])\n", + "draw(deck)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "get_suit = lambda card: card[-1] # last letter is suit\n", + "deck.groupby(get_suit).apply(draw, n=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# alternatively\n", + "deck.groupby(get_suit, group_keys=False).apply(draw, n=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example: Group weighted average and correlation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 
'b'],\n", + " 'data': np.random.randn(8),\n", + " 'weights': np.random.rand(8)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = df.groupby('category')\n", + "get_wavg = lambda g: np.average(g['data'], weights=g['weights'])\n", + "grouped.apply(get_wavg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px = pd.read_csv('ch09/stock_px.csv', parse_dates=True, index_col=0)\n", + "close_px.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px[-4:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rets = close_px.pct_change().dropna()\n", + "spx_corr = lambda x: x.corrwith(x['SPX'])\n", + "by_year = rets.groupby(lambda x: x.year)\n", + "by_year.apply(spx_corr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Annual correlation of Apple with Microsoft\n", + "by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Example: Group-wise linear regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import statsmodels.api as sm\n", + "def regress(data, yvar, xvars):\n", + " Y = data[yvar]\n", + " X = data[xvars]\n", + " X['intercept'] = 1.\n", + " result = sm.OLS(Y, X).fit()\n", + " return result.params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_year.apply(regress, 'AAPL', ['SPX'])" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "## Pivot tables and Cross-tabulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.pivot_table(index=['sex', 'smoker'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],\n", + " columns='smoker')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],\n", + " columns='smoker', margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',\n", + " aggfunc=len, margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tips.pivot_table('size', index=['time', 'sex', 'smoker'],\n", + " columns='day', aggfunc='sum', fill_value=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Cross-tabulations: crosstab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from StringIO import StringIO\n", + "data = \"\"\"\\\n", + "Sample Gender Handedness\n", + "1 Female Right-handed\n", + "2 Male Left-handed\n", + "3 Female Right-handed\n", + "4 Male Right-handed\n", + "5 Male Left-handed\n", + "6 Male Right-handed\n", + "7 Female Right-handed\n", + "8 Female Left-handed\n", + "9 Male Right-handed\n", + "10 Female Right-handed\"\"\"\n", + "data = pd.read_table(StringIO(data), sep='\\s+')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.crosstab(data.Gender, data.Handedness, margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: 2012 Federal Election Commission Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec = pd.read_csv('ch09/P00000001-ALL.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec.ix[123456]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "unique_cands = fec.cand_nm.unique()\n", + "unique_cands" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "unique_cands[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "parties = {'Bachmann, Michelle': 'Republican',\n", + " 'Cain, Herman': 'Republican',\n", + " 'Gingrich, Newt': 'Republican',\n", + " 'Huntsman, Jon': 'Republican',\n", + " 'Johnson, Gary Earl': 'Republican',\n", + " 'McCotter, Thaddeus G': 'Republican',\n", + " 'Obama, Barack': 'Democrat',\n", + " 'Paul, Ron': 'Republican',\n", + " 'Pawlenty, Timothy': 'Republican',\n", + " 'Perry, Rick': 'Republican',\n", + " \"Roemer, Charles E. 
'Buddy' III\": 'Republican',\n", + " 'Romney, Mitt': 'Republican',\n", + " 'Santorum, Rick': 'Republican'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec.cand_nm[123456:123461]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec.cand_nm[123456:123461].map(parties)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Add it as a column\n", + "fec['party'] = fec.cand_nm.map(parties)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec['party'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "(fec.contb_receipt_amt > 0).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec = fec[fec.contb_receipt_amt > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Donation statistics by occupation and employer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fec.contbr_occupation.value_counts()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "occ_mapping = {\n", + " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", + " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", + " 
'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',\n", + " 'C.E.O.': 'CEO'\n", + "}\n", + "\n", + "# If no mapping provided, return x\n", + "f = lambda x: occ_mapping.get(x, x)\n", + "fec.contbr_occupation = fec.contbr_occupation.map(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "emp_mapping = {\n", + " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", + " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", + " 'SELF' : 'SELF-EMPLOYED',\n", + " 'SELF EMPLOYED' : 'SELF-EMPLOYED',\n", + "}\n", + "\n", + "# If no mapping provided, return x\n", + "f = lambda x: emp_mapping.get(x, x)\n", + "fec.contbr_employer = fec.contbr_employer.map(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_occupation = fec.pivot_table('contb_receipt_amt',\n", + " index='contbr_occupation',\n", + " columns='party', aggfunc='sum')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "over_2mm = by_occupation[by_occupation.sum(1) > 2000000]\n", + "over_2mm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "over_2mm.plot(kind='barh')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_top_amounts(group, key, n=5):\n", + " totals = group.groupby(key)['contb_receipt_amt'].sum()\n", + "\n", + " # Order totals by key in descending order\n", + " return totals.order(ascending=False)[-n:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = fec_mrbo.groupby('cand_nm')\n", + "grouped.apply(get_top_amounts, 'contbr_occupation', n=7)" + ] + }, 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped.apply(get_top_amounts, 'contbr_employer', n=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Bucketing donation amounts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])\n", + "labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = fec_mrbo.groupby(['cand_nm', labels])\n", + "grouped.size().unstack(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)\n", + "bucket_sums" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)\n", + "normed_sums" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "normed_sums[:-2].plot(kind='barh', stacked=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Donation statistics by state" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])\n", + "totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)\n", + "totals = totals[totals.sum(1) > 100000]\n", + "totals[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ 
+ "percent = totals.div(totals.sum(1), axis=0)\n", + "percent[:10]" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:884d45c0ec888c81923e3e89bc579b89c2b34c7a37578b0a064bb5dcfd9ffa29" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data Aggregation and Group Operations" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "import os\n", - "import matplotlib.pyplot as plt\n", - "np.random.seed(12345)\n", - "plt.rc('figure', figsize=(10, 6))\n", - "from pandas import Series, DataFrame\n", - "import pandas as pd\n", - "np.set_printoptions(precision=4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.notebook_repr_html = False" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "GroupBy mechanics" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],\n", - " 'key2' : ['one', 'two', 'one', 'two', 'one'],\n", - " 'data1' : np.random.randn(5),\n", - " 'data2' : np.random.randn(5)})\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = df['data1'].groupby(df['key1'])\n", - "grouped" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "grouped.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "means = df['data1'].groupby([df['key1'], df['key2']]).mean()\n", - "means" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "means.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])\n", - "years = np.array([2005, 2005, 2006, 2005, 2006])\n", - "df['data1'].groupby([states, years]).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby('key1').mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby(['key1', 'key2']).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby(['key1', 'key2']).size()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Iterating over groups" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for name, group in df.groupby('key1'):\n", - " print(name)\n", - " print(group)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for (k1, k2), group in df.groupby(['key1', 'key2']):\n", - " print((k1, k2))\n", - " print(group)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pieces = dict(list(df.groupby('key1')))\n", - "pieces['b']" - 
], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.dtypes" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = df.groupby(df.dtypes, axis=1)\n", - "dict(list(grouped))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Selecting a column or subset of columns" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "df.groupby('key1')['data1']\n", - "df.groupby('key1')[['data2']]" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "df['data1'].groupby(df['key1'])\n", - "df[['data2']].groupby(df['key1'])" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby(['key1', 'key2'])[['data2']].mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s_grouped = df.groupby(['key1', 'key2'])['data2']\n", - "s_grouped" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s_grouped.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouping with dicts and Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "people = DataFrame(np.random.randn(5, 5),\n", - " columns=['a', 'b', 'c', 'd', 'e'],\n", - " index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])\n", - "people.ix[2:3, ['b', 'c']] = np.nan # Add a few NA values\n", - "people" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mapping = {'a': 'red', 'b': 'red', 'c': 'blue',\n", - " 'd': 'blue', 'e': 'red', 'f' : 'orange'}" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_column = people.groupby(mapping, axis=1)\n", - "by_column.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "map_series = Series(mapping)\n", - "map_series" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "people.groupby(map_series, axis=1).count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouping with functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "people.groupby(len).sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "key_list = ['one', 'one', 'one', 'two', 'two']\n", - "people.groupby([len, key_list]).min()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouping by index levels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],\n", - " [1, 3, 5, 1, 3]], names=['cty', 'tenor'])\n", - "hier_df = DataFrame(np.random.randn(4, 5), columns=columns)\n", - "hier_df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "hier_df.groupby(level='cty', axis=1).count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data aggregation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "grouped = df.groupby('key1')\n", - "grouped['data1'].quantile(0.9)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def peak_to_peak(arr):\n", - " return arr.max() - arr.min()\n", - "grouped.agg(peak_to_peak)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips = pd.read_csv('ch08/tips.csv')\n", - "# Add tip percentage of total bill\n", - "tips['tip_pct'] = tips['tip'] / tips['total_bill']\n", - "tips[:6]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Column-wise and multiple function application" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = tips.groupby(['sex', 'smoker'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped_pct = grouped['tip_pct']\n", - "grouped_pct.agg('mean')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped_pct.agg(['mean', 'std', peak_to_peak])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "functions = ['count', 'mean', 'max']\n", - "result = grouped['tip_pct', 'total_bill'].agg(functions)\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "result['tip_pct']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]\n", - "grouped['tip_pct', 'total_bill'].agg(ftuples)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.agg({'tip' : np.max, 'size' : 'sum'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],\n", - " 'size' : 'sum'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Returning aggregated data in \"unindexed\" form" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby(['sex', 'smoker'], as_index=False).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Group-wise operations and transformations" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "k1_means = df.groupby('key1').mean().add_prefix('mean_')\n", - "k1_means" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df, k1_means, left_on='key1', right_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "key = ['one', 'two', 'one', 'two', 'one']\n", - "people.groupby(key).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "people.groupby(key).transform(np.mean)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def demean(arr):\n", - " return arr - arr.mean()\n", - "demeaned = people.groupby(key).transform(demean)\n", - "demeaned" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "demeaned.groupby(key).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Apply: General split-apply-combine" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def top(df, n=5, column='tip_pct'):\n", - " return df.sort_index(by=column)[-n:]\n", - "top(tips, n=6)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby('smoker').apply(top)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = tips.groupby('smoker')['tip_pct'].describe()\n", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result.unstack('smoker')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "f = lambda x: x.describe()\n", - "grouped.apply(f)" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Suppressing the group keys" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"tips.groupby('smoker', group_keys=False).apply(top)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Quantile and bucket analysis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame({'data1': np.random.randn(1000),\n", - " 'data2': np.random.randn(1000)})\n", - "factor = pd.cut(frame.data1, 4)\n", - "factor[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_stats(group):\n", - " return {'min': group.min(), 'max': group.max(),\n", - " 'count': group.count(), 'mean': group.mean()}\n", - "\n", - "grouped = frame.data2.groupby(factor)\n", - "grouped.apply(get_stats).unstack()\n", - "\n", - "#ADAPT the output is not sorted in the book while this is the case now (swap first two lines)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Return quantile numbers\n", - "grouping = pd.qcut(frame.data1, 10, labels=False)\n", - "\n", - "grouped = frame.data2.groupby(grouping)\n", - "grouped.apply(get_stats).unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Example: Filling missing values with group-specific values" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = Series(np.random.randn(6))\n", - "s[::2] = np.nan\n", - "s" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s.fillna(s.mean())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = ['Ohio', 'New York', 'Vermont', 'Florida',\n", - " 'Oregon', 'Nevada', 'California', 'Idaho']\n", - "group_key = 
['East'] * 4 + ['West'] * 4\n", - "data = Series(np.random.randn(8), index=states)\n", - "data[['Vermont', 'Nevada', 'Idaho']] = np.nan\n", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.groupby(group_key).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fill_mean = lambda g: g.fillna(g.mean())\n", - "data.groupby(group_key).apply(fill_mean)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fill_values = {'East': 0.5, 'West': -1}\n", - "fill_func = lambda g: g.fillna(fill_values[g.name])\n", - "\n", - "data.groupby(group_key).apply(fill_func)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Example: Random sampling and permutation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Hearts, Spades, Clubs, Diamonds\n", - "suits = ['H', 'S', 'C', 'D']\n", - "card_val = (range(1, 11) + [10] * 3) * 4\n", - "base_names = ['A'] + range(2, 11) + ['J', 'K', 'Q']\n", - "cards = []\n", - "for suit in ['H', 'S', 'C', 'D']:\n", - " cards.extend(str(num) + suit for num in base_names)\n", - "\n", - "deck = Series(card_val, index=cards)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "deck[:13]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def draw(deck, n=5):\n", - " return deck.take(np.random.permutation(len(deck))[:n])\n", - "draw(deck)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "get_suit = lambda card: card[-1] # last letter 
is suit\n", - "deck.groupby(get_suit).apply(draw, n=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# alternatively\n", - "deck.groupby(get_suit, group_keys=False).apply(draw, n=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Example: Group weighted average and correlation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],\n", - " 'data': np.random.randn(8),\n", - " 'weights': np.random.rand(8)})\n", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = df.groupby('category')\n", - "get_wavg = lambda g: np.average(g['data'], weights=g['weights'])\n", - "grouped.apply(get_wavg)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px = pd.read_csv('ch09/stock_px.csv', parse_dates=True, index_col=0)\n", - "close_px.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px[-4:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rets = close_px.pct_change().dropna()\n", - "spx_corr = lambda x: x.corrwith(x['SPX'])\n", - "by_year = rets.groupby(lambda x: x.year)\n", - "by_year.apply(spx_corr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Annual correlation of Apple with Microsoft\n", - "by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - 
"metadata": {}, - "source": [ - "Example: Group-wise linear regression" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import statsmodels.api as sm\n", - "def regress(data, yvar, xvars):\n", - " Y = data[yvar]\n", - " X = data[xvars]\n", - " X['intercept'] = 1.\n", - " result = sm.OLS(Y, X).fit()\n", - " return result.params" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_year.apply(regress, 'AAPL', ['SPX'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Pivot tables and Cross-tabulation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table(index=['sex', 'smoker'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],\n", - " columns='smoker')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table(['tip_pct', 'size'], index=['sex', 'day'],\n", - " columns='smoker', margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table('tip_pct', index=['sex', 'smoker'], columns='day',\n", - " aggfunc=len, margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table('size', index=['time', 'sex', 'smoker'],\n", - " columns='day', aggfunc='sum', fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Cross-tabulations: crosstab" - ] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "from StringIO import StringIO\n", - "data = \"\"\"\\\n", - "Sample Gender Handedness\n", - "1 Female Right-handed\n", - "2 Male Left-handed\n", - "3 Female Right-handed\n", - "4 Male Right-handed\n", - "5 Male Left-handed\n", - "6 Male Right-handed\n", - "7 Female Right-handed\n", - "8 Female Left-handed\n", - "9 Male Right-handed\n", - "10 Female Right-handed\"\"\"\n", - "data = pd.read_table(StringIO(data), sep='\\s+')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.crosstab(data.Gender, data.Handedness, margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Example: 2012 Federal Election Commission Database" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec = pd.read_csv('ch09/P00000001-ALL.csv')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.ix[123456]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "unique_cands = fec.cand_nm.unique()\n", - "unique_cands" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "unique_cands[2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "parties = {'Bachmann, Michelle': 'Republican',\n", - " 'Cain, Herman': 'Republican',\n", - " 'Gingrich, Newt': 'Republican',\n", - " 'Huntsman, Jon': 'Republican',\n", - " 'Johnson, Gary Earl': 'Republican',\n", - " 'McCotter, Thaddeus G': 'Republican',\n", - " 'Obama, Barack': 'Democrat',\n", - " 'Paul, Ron': 'Republican',\n", - " 'Pawlenty, Timothy': 'Republican',\n", - " 'Perry, Rick': 'Republican',\n", - " \"Roemer, Charles E. 'Buddy' III\": 'Republican',\n", - " 'Romney, Mitt': 'Republican',\n", - " 'Santorum, Rick': 'Republican'}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.cand_nm[123456:123461]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.cand_nm[123456:123461].map(parties)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Add it as a column\n", - "fec['party'] = fec.cand_nm.map(parties)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec['party'].value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "(fec.contb_receipt_amt > 0).value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec = fec[fec.contb_receipt_amt > 0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - 
"Donation statistics by occupation and employer" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.contbr_occupation.value_counts()[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "occ_mapping = {\n", - " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", - " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", - " 'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',\n", - " 'C.E.O.': 'CEO'\n", - "}\n", - "\n", - "# If no mapping provided, return x\n", - "f = lambda x: occ_mapping.get(x, x)\n", - "fec.contbr_occupation = fec.contbr_occupation.map(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "emp_mapping = {\n", - " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", - " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", - " 'SELF' : 'SELF-EMPLOYED',\n", - " 'SELF EMPLOYED' : 'SELF-EMPLOYED',\n", - "}\n", - "\n", - "# If no mapping provided, return x\n", - "f = lambda x: emp_mapping.get(x, x)\n", - "fec.contbr_employer = fec.contbr_employer.map(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_occupation = fec.pivot_table('contb_receipt_amt',\n", - " index='contbr_occupation',\n", - " columns='party', aggfunc='sum')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "over_2mm = by_occupation[by_occupation.sum(1) > 2000000]\n", - "over_2mm" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "over_2mm.plot(kind='barh')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_top_amounts(group, key, n=5):\n", - " 
totals = group.groupby(key)['contb_receipt_amt'].sum()\n", - "\n", - " # Order totals by key in descending order\n", - " return totals.order(ascending=False)[-n:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = fec_mrbo.groupby('cand_nm')\n", - "grouped.apply(get_top_amounts, 'contbr_occupation', n=7)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.apply(get_top_amounts, 'contbr_employer', n=10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Bucketing donation amounts" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = np.array([0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000])\n", - "labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)\n", - "labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = fec_mrbo.groupby(['cand_nm', labels])\n", - "grouped.size().unstack(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)\n", - "bucket_sums" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)\n", - "normed_sums" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "normed_sums[:-2].plot(kind='barh', stacked=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Donation statistics by state" - ] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])\n", - "totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)\n", - "totals = totals[totals.sum(1) > 100000]\n", - "totals[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "percent = totals.div(totals.sum(1), axis=0)\n", - "percent[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 96f4872f3d91b4b2fb1a330913e8d4a8c70d7594 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 6 Nov 2015 14:20:12 -0800 Subject: [PATCH 7/9] Run through ch10 --- ch10.ipynb | 4344 +++++++++++++++++++++++++++------------------------- 1 file changed, 2239 insertions(+), 2105 deletions(-) diff --git a/ch10.ipynb b/ch10.ipynb index fdda16547..3a882fcdd 100644 --- a/ch10.ipynb +++ b/ch10.ipynb @@ -1,2108 +1,2242 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Time series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from pandas import Series, DataFrame\n", + "import pandas as pd\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "pd.options.display.max_rows = 12\n", + "np.set_printoptions(precision=4, suppress=True)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(12, 4))" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Date and Time Data Types and Tools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "now = datetime.now()\n", + "now" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "now.year, now.month, now.day" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)\n", + "delta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "delta.days" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "delta.seconds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "start = datetime(2011, 1, 7)\n", + "start + timedelta(12)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "start - 2 * timedelta(12)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Converting between string and datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp = datetime(2011, 1, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "str(stamp)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp.strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "value = '2011-01-03'\n", + "datetime.strptime(value, '%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "datestrs = ['7/6/2011', '8/6/2011']\n", + "[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from dateutil.parser import parse\n", + "parse('2011-01-03')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "parse('Jan 31, 1997 10:45 PM')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "parse('6/12/2011', dayfirst=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "datestrs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.to_datetime(datestrs)\n", + "# note: output changed (no '00:00:00' anymore)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "idx = pd.to_datetime(datestrs + [None])\n", + "idx" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "idx[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.isnull(idx)" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time Series Basics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),\n", + " datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]\n", + "ts = Series(np.random.randn(6), index=dates)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "type(ts)\n", + "# note: output changed to \"pandas.core.series.Series\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts + ts[::2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.index.dtype\n", + "# note: output changed from dtype('datetime64[ns]') to dtype(' to Timestamp('2011-01-02 00:00:00')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indexing, selection, subsetting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp = ts.index[2]\n", + "ts[stamp]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts['1/10/2011']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts['20110110']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ 
+ "longer_ts = Series(np.random.randn(1000),\n", + " index=pd.date_range('1/1/2000', periods=1000))\n", + "longer_ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "longer_ts['2001']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "longer_ts['2001-05']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts[datetime(2011, 1, 7):]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts['1/6/2011':'1/11/2011']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.truncate(after='1/9/2011')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')\n", + "long_df = DataFrame(np.random.randn(100, 4),\n", + " index=dates,\n", + " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", + "long_df.ix['5-2001']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Time series with duplicate indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000',\n", + " '1/3/2000'])\n", + "dup_ts = Series(np.arange(5), index=dates)\n", + "dup_ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"dup_ts.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dup_ts['1/3/2000'] # not duplicated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dup_ts['1/2/2000'] # duplicated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped = dup_ts.groupby(level=0)\n", + "grouped.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "grouped.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Date ranges, Frequencies, and Shifting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('D')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generating date ranges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "index = pd.date_range('4/1/2012', '6/1/2012')\n", + "index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range(start='4/1/2012', periods=20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range(end='6/1/2012', periods=20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range('1/1/2000', '12/1/2000', 
freq='BM')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range('5/2/2012 12:56:31', periods=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range('5/2/2012 12:56:31', periods=5, normalize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Frequencies and Date Offsets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas.tseries.offsets import Hour, Minute\n", + "hour = Hour()\n", + "hour" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "four_hours = Hour(4)\n", + "four_hours" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4h')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Hour(2) + Minute(30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range('1/1/2000', periods=10, freq='1h30min')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Week of month dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')\n", + "list(rng)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Shifting (leading and lagging) data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, 
+ "outputs": [], + "source": [ + "ts = Series(np.random.randn(4),\n", + " index=pd.date_range('1/1/2000', periods=4, freq='M'))\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.shift(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.shift(-2)" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "ts / ts.shift(1) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.shift(2, freq='M')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.shift(3, freq='D')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.shift(1, freq='3D')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.shift(1, freq='90T')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Shifting dates with offsets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from pandas.tseries.offsets import Day, MonthEnd\n", + "now = datetime(2011, 11, 17)\n", + "now + 3 * Day()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "now + MonthEnd()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "now + MonthEnd(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"offset = MonthEnd()\n", + "offset.rollforward(now)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "offset.rollback(now)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts = Series(np.random.randn(20),\n", + " index=pd.date_range('1/15/2000', periods=20, freq='4d'))\n", + "ts.groupby(offset.rollforward).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('M', how='mean')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time Zone Handling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pytz\n", + "pytz.common_timezones[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "tz = pytz.timezone('US/Eastern')\n", + "tz" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Localization and Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')\n", + "ts = Series(np.random.randn(len(rng)), index=rng)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(ts.index.tz)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + 
"ts_utc = ts.tz_localize('UTC')\n", + "ts_utc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts_utc.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts_utc.tz_convert('US/Eastern')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts_eastern = ts.tz_localize('US/Eastern')\n", + "ts_eastern.tz_convert('UTC')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts_eastern.tz_convert('Europe/Berlin')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.index.tz_localize('Asia/Shanghai')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operations with time zone-aware Timestamp objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp = pd.Timestamp('2011-03-12 04:00')\n", + "stamp_utc = stamp.tz_localize('utc')\n", + "stamp_utc.tz_convert('US/Eastern')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')\n", + "stamp_moscow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp_utc.value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp_utc.tz_convert('US/Eastern').value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "# 30 minutes before DST transition\n", + "from pandas.tseries.offsets import Hour\n", + "stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')\n", + "stamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp + Hour()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# 90 minutes before DST transition\n", + "stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')\n", + "stamp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "stamp + 2 * Hour()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operations between different time zones" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')\n", + "ts = Series(np.random.randn(len(rng)), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts1 = ts[:7].tz_localize('Europe/London')\n", + "ts2 = ts1[2:].tz_convert('Europe/Moscow')\n", + "result = ts1 + ts2\n", + "result.index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Periods and Period Arithmetic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = pd.Period(2007, freq='A-DEC')\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p + 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "p - 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.Period('2014', freq='A-DEC') - p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')\n", + "rng" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Series(np.random.randn(6), index=rng)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "values = ['2001Q3', '2002Q2', '2003Q1']\n", + "index = pd.PeriodIndex(values, freq='Q-DEC')\n", + "index" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Period Frequency Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = pd.Period('2007', freq='A-DEC')\n", + "p.asfreq('M', how='start')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p.asfreq('M', how='end')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = pd.Period('2007', freq='A-JUN')\n", + "p.asfreq('M', 'start')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p.asfreq('M', 'end')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = pd.Period('Aug-2007', 'M')\n", + "p.asfreq('A-JUN')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "rng = pd.period_range('2006', '2009', freq='A-DEC')\n", + "ts = Series(np.random.randn(len(rng)), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.asfreq('M', how='start')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.asfreq('B', how='end')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quarterly period frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p = pd.Period('2012Q4', freq='Q-JAN')\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p.asfreq('D', 'start')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p.asfreq('D', 'end')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60\n", + "p4pm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "p4pm.to_timestamp()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')\n", + "ts = Series(np.arange(len(rng)), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60\n", + "ts.index = new_rng.to_timestamp()\n", + "ts" + ] + }, + 
{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Converting Timestamps to Periods (and back)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2000', periods=3, freq='M')\n", + "ts = Series(randn(3), index=rng)\n", + "pts = ts.to_period()\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/29/2000', periods=6, freq='D')\n", + "ts2 = Series(randn(6), index=rng)\n", + "ts2.to_period('M')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pts = ts.to_period()\n", + "pts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pts.to_timestamp(how='end')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Creating a PeriodIndex from arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = pd.read_csv('ch08/macrodata.csv')\n", + "data.year" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.quarter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')\n", + "index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data.index = index\n", + "data.infl" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Resampling and Frequency Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2000', periods=100, freq='D')\n", + "ts = Series(randn(len(rng)), index=rng)\n", + "ts.resample('M', how='mean')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('M', how='mean', kind='period')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Downsampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2000', periods=12, freq='T')\n", + "ts = Series(np.arange(12), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('5min', how='sum')\n", + "# note: output changed (as the default changed from closed='right', label='right' to closed='left', label='left'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('5min', how='sum', closed='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('5min', how='sum', closed='left', label='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('5min', how='sum', loffset='-1s')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Open-High-Low-Close (OHLC) resampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false + }, + "outputs": [], + "source": [ + "ts.resample('5min', how='ohlc')\n", + "# note: output changed because of changed defaults" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Resampling with GroupBy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2000', periods=100, freq='D')\n", + "ts = Series(np.arange(100), index=rng)\n", + "ts.groupby(lambda x: x.month).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.groupby(lambda x: x.weekday).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upsampling and interpolation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.random.randn(2, 4),\n", + " index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),\n", + " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df_daily = frame.resample('D')\n", + "df_daily" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.resample('D', fill_method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.resample('D', fill_method='ffill', limit=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame.resample('W-THU', fill_method='ffill')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Resampling with periods" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "frame = DataFrame(np.random.randn(24, 4),\n", + " index=pd.period_range('1-2000', '12-2001', freq='M'),\n", + " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", + "frame[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "annual_frame = frame.resample('A-DEC', how='mean')\n", + "annual_frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Q-DEC: Quarterly, year ending in December\n", + "annual_frame.resample('Q-DEC', fill_method='ffill')\n", + "# note: output changed, default value changed from convention='end' to convention='start' + 'start' changed to span-like\n", + "# also the following cells" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "annual_frame.resample('Q-DEC', fill_method='ffill', convention='start')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "annual_frame.resample('Q-MAR', fill_method='ffill')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Time series plotting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px_all = pd.read_csv('ch09/stock_px.csv', parse_dates=True, index_col=0)\n", + "close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]\n", + "close_px = close_px.resample('B', fill_method='ffill')\n", + "close_px.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px['AAPL'].plot()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px.ix['2009'].plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px['AAPL'].ix['01-2011':'03-2011'].plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "appl_q = close_px['AAPL'].resample('Q-DEC', fill_method='ffill')\n", + "appl_q.ix['2009':].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Moving window functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px = close_px.asfreq('B').fillna(method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px.AAPL.plot()\n", + "pd.rolling_mean(close_px.AAPL, 250).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "appl_std250 = pd.rolling_std(close_px.AAPL, 250, min_periods=10)\n", + "appl_std250[5:12]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "appl_std250.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Define expanding mean in terms of rolling_mean\n", + "expanding_mean = lambda x: rolling_mean(x, len(x), min_periods=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + 
"source": [ + "pd.rolling_mean(close_px, 60).plot(logy=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Exponentially-weighted functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,\n", + " figsize=(12, 7))\n", + "\n", + "aapl_px = close_px.AAPL['2005':'2009']\n", + "\n", + "ma60 = pd.rolling_mean(aapl_px, 60, min_periods=50)\n", + "ewma60 = pd.ewma(aapl_px, span=60)\n", + "\n", + "aapl_px.plot(style='k-', ax=axes[0])\n", + "ma60.plot(style='k--', ax=axes[0])\n", + "aapl_px.plot(style='k-', ax=axes[1])\n", + "ewma60.plot(style='k--', ax=axes[1])\n", + "axes[0].set_title('Simple MA')\n", + "axes[1].set_title('Exponentially-weighted MA')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Binary moving window functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px\n", + "spx_px = close_px_all['SPX']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spx_rets = spx_px / spx_px.shift(1) - 1\n", + "returns = close_px.pct_change()\n", + "corr = pd.rolling_corr(returns.AAPL, spx_rets, 125, min_periods=100)\n", + "corr.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "corr = pd.rolling_corr(returns, spx_rets, 125, min_periods=100)\n", + "corr.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### User-defined moving window functions" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "score_at_2percent = lambda x: percentileofscore(x, 0.02)\n", + "result = pd.rolling_apply(returns.AAPL, 250, score_at_2percent)\n", + "result.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance and Memory Usage Notes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2000', periods=10000000, freq='10ms')\n", + "ts = Series(np.random.randn(len(rng)), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.resample('15min', how='ohlc').info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%timeit ts.resample('15min', how='ohlc')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/1/2000', periods=10000000, freq='1s')\n", + "ts = Series(np.random.randn(len(rng)), index=rng)\n", + "%timeit ts.resample('15s', how='ohlc')" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:af14713612af6df3af27b7546e7130b724231374a9fb48b739f50bf0b51b776c" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Time series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from pandas import Series, DataFrame\n", - "import pandas as pd\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "pd.options.display.max_rows = 12\n", - "np.set_printoptions(precision=4, suppress=True)\n", - 
"import matplotlib.pyplot as plt\n", - "plt.rc('figure', figsize=(12, 4))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Date and Time Data Types and Tools" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime\n", - "now = datetime.now()\n", - "now" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "now.year, now.month, now.day" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)\n", - "delta" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "delta.days" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "delta.seconds" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import timedelta\n", - "start = datetime(2011, 1, 7)\n", - "start + timedelta(12)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "start - 2 * timedelta(12)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Converting between string and datetime" - ] - }, - { - "cell_type": "code", - "collapsed": 
false, - "input": [ - "stamp = datetime(2011, 1, 3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "str(stamp)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp.strftime('%Y-%m-%d')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "value = '2011-01-03'\n", - "datetime.strptime(value, '%Y-%m-%d')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "datestrs = ['7/6/2011', '8/6/2011']\n", - "[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from dateutil.parser import parse\n", - "parse('2011-01-03')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "parse('Jan 31, 1997 10:45 PM')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "parse('6/12/2011', dayfirst=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "datestrs" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.to_datetime(datestrs)\n", - "# note: output changed (no '00:00:00' anymore)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "idx = pd.to_datetime(datestrs + [None])\n", - "idx" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "idx[2]" - 
], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.isnull(idx)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Time Series Basics" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime\n", - "dates = [datetime(2011, 1, 2), datetime(2011, 1, 5), datetime(2011, 1, 7),\n", - " datetime(2011, 1, 8), datetime(2011, 1, 10), datetime(2011, 1, 12)]\n", - "ts = Series(np.random.randn(6), index=dates)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "type(ts)\n", - "# note: output changed to \"pandas.core.series.Series\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts + ts[::2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.index.dtype\n", - "# note: output changed from dtype('datetime64[ns]') to dtype(' to Timestamp('2011-01-02 00:00:00')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Indexing, selection, subsetting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = ts.index[2]\n", - "ts[stamp]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts['1/10/2011']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts['20110110']" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "longer_ts = Series(np.random.randn(1000),\n", - " index=pd.date_range('1/1/2000', periods=1000))\n", - "longer_ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "longer_ts['2001']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "longer_ts['2001-05']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts[datetime(2011, 1, 7):]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts['1/6/2011':'1/11/2011']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.truncate(after='1/9/2011')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')\n", - "long_df = DataFrame(np.random.randn(100, 4),\n", - " index=dates,\n", - " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", - "long_df.ix['5-2001']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Time series with duplicate indices" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000', '1/2/2000',\n", - " '1/3/2000'])\n", - "dup_ts = Series(np.arange(5), index=dates)\n", - "dup_ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "dup_ts.index.is_unique" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dup_ts['1/3/2000'] # not duplicated" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dup_ts['1/2/2000'] # duplicated" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = dup_ts.groupby(level=0)\n", - "grouped.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Date ranges, Frequencies, and Shifting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('D')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Generating date ranges" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index = pd.date_range('4/1/2012', '6/1/2012')\n", - "index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range(start='4/1/2012', periods=20)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range(end='6/1/2012', periods=20)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('1/1/2000', 
'12/1/2000', freq='BM')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('5/2/2012 12:56:31', periods=5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('5/2/2012 12:56:31', periods=5, normalize=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Frequencies and Date Offsets" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas.tseries.offsets import Hour, Minute\n", - "hour = Hour()\n", - "hour" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "four_hours = Hour(4)\n", - "four_hours" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('1/1/2000', '1/3/2000 23:59', freq='4h')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "Hour(2) + Minute(30)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('1/1/2000', periods=10, freq='1h30min')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Week of month dates" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/1/2012', '9/1/2012', freq='WOM-3FRI')\n", - "list(rng)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Shifting (leading and lagging) data" - ] - }, - { - "cell_type": "code", - "collapsed": 
false, - "input": [ - "ts = Series(np.random.randn(4),\n", - " index=pd.date_range('1/1/2000', periods=4, freq='M'))\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(-2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ts / ts.shift(1) - 1" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(2, freq='M')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(3, freq='D')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(1, freq='3D')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(1, freq='90T')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Shifting dates with offsets" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas.tseries.offsets import Day, MonthEnd\n", - "now = datetime(2011, 11, 17)\n", - "now + 3 * Day()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "now + MonthEnd()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "now + MonthEnd(2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "offset = MonthEnd()\n", - "offset.rollforward(now)" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "offset.rollback(now)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts = Series(np.random.randn(20),\n", - " index=pd.date_range('1/15/2000', periods=20, freq='4d'))\n", - "ts.groupby(offset.rollforward).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('M', how='mean')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Time Zone Handling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pytz\n", - "pytz.common_timezones[-5:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tz = pytz.timezone('US/Eastern')\n", - "tz" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Localization and Conversion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')\n", - "ts = Series(np.random.randn(len(rng)), index=rng)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(ts.index.tz)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_utc = ts.tz_localize('UTC')\n", - "ts_utc" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_utc.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_utc.tz_convert('US/Eastern')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_eastern = ts.tz_localize('US/Eastern')\n", - "ts_eastern.tz_convert('UTC')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_eastern.tz_convert('Europe/Berlin')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.index.tz_localize('Asia/Shanghai')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Operations with time zone-aware Timestamp objects" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = pd.Timestamp('2011-03-12 04:00')\n", - "stamp_utc = stamp.tz_localize('utc')\n", - "stamp_utc.tz_convert('US/Eastern')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')\n", - "stamp_moscow" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp_utc.value" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp_utc.tz_convert('US/Eastern').value" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 30 minutes before DST transition\n", - "from pandas.tseries.offsets import Hour\n", - "stamp = 
pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')\n", - "stamp" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp + Hour()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 90 minutes before DST transition\n", - "stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')\n", - "stamp" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp + 2 * Hour()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Operations between different time zones" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')\n", - "ts = Series(np.random.randn(len(rng)), index=rng)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts1 = ts[:7].tz_localize('Europe/London')\n", - "ts2 = ts1[2:].tz_convert('Europe/Moscow')\n", - "result = ts1 + ts2\n", - "result.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Periods and Period Arithmetic" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period(2007, freq='A-DEC')\n", - "p" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p + 5" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p - 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.Period('2014', 
freq='A-DEC') - p" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.period_range('1/1/2000', '6/30/2000', freq='M')\n", - "rng" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "Series(np.random.randn(6), index=rng)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = ['2001Q3', '2002Q2', '2003Q1']\n", - "index = pd.PeriodIndex(values, freq='Q-DEC')\n", - "index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Period Frequency Conversion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('2007', freq='A-DEC')\n", - "p.asfreq('M', how='start')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p.asfreq('M', how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('2007', freq='A-JUN')\n", - "p.asfreq('M', 'start')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p.asfreq('M', 'end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('Aug-2007', 'M')\n", - "p.asfreq('A-JUN')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.period_range('2006', '2009', freq='A-DEC')\n", - "ts = Series(np.random.randn(len(rng)), index=rng)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "ts.asfreq('M', how='start')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.asfreq('B', how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Quarterly period frequencies" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('2012Q4', freq='Q-JAN')\n", - "p" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p.asfreq('D', 'start')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p.asfreq('D', 'end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60\n", - "p4pm" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p4pm.to_timestamp()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')\n", - "ts = Series(np.arange(len(rng)), index=rng)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60\n", - "ts.index = new_rng.to_timestamp()\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Converting Timestamps to Periods (and back)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = 
pd.date_range('1/1/2000', periods=3, freq='M')\n", - "ts = Series(randn(3), index=rng)\n", - "pts = ts.to_period()\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/29/2000', periods=6, freq='D')\n", - "ts2 = Series(randn(6), index=rng)\n", - "ts2.to_period('M')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pts = ts.to_period()\n", - "pts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pts.to_timestamp(how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Creating a PeriodIndex from arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('ch08/macrodata.csv')\n", - "data.year" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.quarter" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index = pd.PeriodIndex(year=data.year, quarter=data.quarter, freq='Q-DEC')\n", - "index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.index = index\n", - "data.infl" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Resampling and Frequency Conversion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/1/2000', 
periods=100, freq='D')\n", - "ts = Series(randn(len(rng)), index=rng)\n", - "ts.resample('M', how='mean')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('M', how='mean', kind='period')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Downsampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/1/2000', periods=12, freq='T')\n", - "ts = Series(np.arange(12), index=rng)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', how='sum')\n", - "# note: output changed (as the default changed from closed='right', label='right' to closed='left', label='left'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', how='sum', closed='left')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', how='sum', closed='left', label='left')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', how='sum', loffset='-1s')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Open-High-Low-Close (OHLC) resampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', how='ohlc')\n", - "# note: output changed because of changed defaults" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Resampling with GroupBy" - ] - 
}, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/1/2000', periods=100, freq='D')\n", - "ts = Series(np.arange(100), index=rng)\n", - "ts.groupby(lambda x: x.month).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.groupby(lambda x: x.weekday).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Upsampling and interpolation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.random.randn(2, 4),\n", - " index=pd.date_range('1/1/2000', periods=2, freq='W-WED'),\n", - " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_daily = frame.resample('D')\n", - "df_daily" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.resample('D', fill_method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.resample('D', fill_method='ffill', limit=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.resample('W-THU', fill_method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Resampling with periods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = DataFrame(np.random.randn(24, 4),\n", - " index=pd.period_range('1-2000', '12-2001', freq='M'),\n", - " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", - "frame[:5]" - ], - "language": "python", - 
"metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "annual_frame = frame.resample('A-DEC', how='mean')\n", - "annual_frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Q-DEC: Quarterly, year ending in December\n", - "annual_frame.resample('Q-DEC', fill_method='ffill')\n", - "# note: output changed, default value changed from convention='end' to convention='start' + 'start' changed to span-like\n", - "# also the following cells" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "annual_frame.resample('Q-DEC', fill_method='ffill', convention='start')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "annual_frame.resample('Q-MAR', fill_method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Time series plotting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px_all = pd.read_csv('ch09/stock_px.csv', parse_dates=True, index_col=0)\n", - "close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]\n", - "close_px = close_px.resample('B', fill_method='ffill')\n", - "close_px.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px['AAPL'].plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px.ix['2009'].plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px['AAPL'].ix['01-2011':'03-2011'].plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "appl_q = close_px['AAPL'].resample('Q-DEC', fill_method='ffill')\n", - "appl_q.ix['2009':].plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Moving window functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px = close_px.asfreq('B').fillna(method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px.AAPL.plot()\n", - "pd.rolling_mean(close_px.AAPL, 250).plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "appl_std250 = pd.rolling_std(close_px.AAPL, 250, min_periods=10)\n", - "appl_std250[5:12]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "appl_std250.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Define expanding mean in terms of rolling_mean\n", - "expanding_mean = lambda x: rolling_mean(x, len(x), min_periods=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.rolling_mean(close_px, 60).plot(logy=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Exponentially-weighted functions" - ] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, sharey=True,\n", - " figsize=(12, 7))\n", - "\n", - "aapl_px = close_px.AAPL['2005':'2009']\n", - "\n", - "ma60 = pd.rolling_mean(aapl_px, 60, min_periods=50)\n", - "ewma60 = pd.ewma(aapl_px, span=60)\n", - "\n", - "aapl_px.plot(style='k-', ax=axes[0])\n", - "ma60.plot(style='k--', ax=axes[0])\n", - "aapl_px.plot(style='k-', ax=axes[1])\n", - "ewma60.plot(style='k--', ax=axes[1])\n", - "axes[0].set_title('Simple MA')\n", - "axes[1].set_title('Exponentially-weighted MA')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Binary moving window functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px\n", - "spx_px = close_px_all['SPX']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "spx_rets = spx_px / spx_px.shift(1) - 1\n", - "returns = close_px.pct_change()\n", - "corr = pd.rolling_corr(returns.AAPL, spx_rets, 125, min_periods=100)\n", - "corr.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "corr = pd.rolling_corr(returns, spx_rets, 125, min_periods=100)\n", - "corr.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "User-defined moving window functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from scipy.stats import percentileofscore\n", - "score_at_2percent = lambda x: percentileofscore(x, 0.02)\n", - "result = pd.rolling_apply(returns.AAPL, 250, score_at_2percent)\n", - "result.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - 
"Performance and Memory Usage Notes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/1/2000', periods=10000000, freq='10ms')\n", - "ts = Series(np.random.randn(len(rng)), index=rng)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('15min', how='ohlc').info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%timeit ts.resample('15min', how='ohlc')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/1/2000', periods=10000000, freq='1s')\n", - "ts = Series(np.random.randn(len(rng)), index=rng)\n", - "%timeit ts.resample('15s', how='ohlc')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From a46b3e6806b3ea3e5dd03d58e62961a19de2423f Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 6 Nov 2015 14:22:55 -0800 Subject: [PATCH 8/9] Run through ch11 --- ch11.ipynb | 2558 +++++++++++++++++++++++++++------------------------- 1 file changed, 1316 insertions(+), 1242 deletions(-) diff --git a/ch11.ipynb b/ch11.ipynb index b97b303fd..d9568de78 100644 --- a/ch11.ipynb +++ b/ch11.ipynb @@ -1,1245 +1,1319 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Financial and Economic Data Applications" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from pandas import Series, DataFrame\n", + "import pandas as pd\n", + "from numpy.random import randn\n", + "import numpy as np\n", + "pd.options.display.max_rows = 12\n", + "np.set_printoptions(precision=4, suppress=True)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(12, 6))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pwd" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data munging topics" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Time series and cross-section alignment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "close_px = pd.read_csv('ch11/stock_px.csv', parse_dates=True, index_col=0)\n", + "volume = pd.read_csv('ch11/volume.csv', parse_dates=True, index_col=0)\n", + "prices = close_px.ix['2011-09-05':'2011-09-14', ['AAPL', 'JNJ', 'SPX', 'XOM']]\n", + "volume = volume.ix['2011-09-05':'2011-09-12', ['AAPL', 'JNJ', 'XOM']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "volume" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prices * volume" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + 
"source": [ + "vwap = (prices * volume).sum() / volume.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vwap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vwap.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prices.align(volume, join='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "s1 = Series(range(3), index=['a', 'b', 'c'])\n", + "s2 = Series(range(4), index=['d', 'b', 'c', 'e'])\n", + "s3 = Series(range(3), index=['f', 'a', 'c'])\n", + "DataFrame({'one': s1, 'two': s2, 'three': s3})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Operations with time series of different frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts1 = Series(np.random.randn(3),\n", + " index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))\n", + "ts1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts1.resample('B')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts1.resample('B', fill_method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dates = pd.DatetimeIndex(['2012-6-12', 
'2012-6-17', '2012-6-18',\n", + " '2012-6-21', '2012-6-22', '2012-6-29'])\n", + "ts2 = Series(np.random.randn(6), index=dates)\n", + "ts2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts1.reindex(ts2.index, method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts2 + ts1.reindex(ts2.index, method='ffill')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Using periods instead of timestamps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],\n", + " index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))\n", + "infl = Series([0.025, 0.045, 0.037, 0.04],\n", + " index=pd.period_range('1982', periods=4, freq='A-DEC'))\n", + "gdp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "infl" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "infl_q = infl.asfreq('Q-SEP', how='end')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "infl_q" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "infl_q.reindex(gdp.index, method='ffill')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Time of day and \"as of\" data selection" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Make an intraday date range and time series\n", + "rng = 
pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='T')\n", + "# Make a 5-day series of 9:30-15:59 values\n", + "rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1, 4)])\n", + "ts = Series(np.arange(len(rng), dtype=float), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from datetime import time\n", + "ts[time(10, 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.at_time(time(10, 0))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ts.between_time(time(10, 0), time(10, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12346)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Set most of the time series randomly to NA\n", + "indexer = np.sort(np.random.permutation(len(ts))[700:])\n", + "irr_ts = ts.copy()\n", + "irr_ts[indexer] = np.nan\n", + "irr_ts['2012-06-01 09:50':'2012-06-01 10:00']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "selection = pd.date_range('2012-06-01 10:00', periods=4, freq='B')\n", + "irr_ts.asof(selection)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Splicing together data sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data1 = DataFrame(np.ones((6, 3), dtype=float),\n", + " columns=['a', 'b', 'c'],\n", + " index=pd.date_range('6/12/2012', periods=6))\n", + "data2 = DataFrame(np.ones((6, 3), dtype=float) * 
2,\n", + " columns=['a', 'b', 'c'],\n", + " index=pd.date_range('6/13/2012', periods=6))\n", + "spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])\n", + "spliced" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data2 = DataFrame(np.ones((6, 4), dtype=float) * 2,\n", + " columns=['a', 'b', 'c', 'd'],\n", + " index=pd.date_range('6/13/2012', periods=6))\n", + "spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])\n", + "spliced" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spliced_filled = spliced.combine_first(data2)\n", + "spliced_filled" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spliced.update(data2, overwrite=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "spliced" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cp_spliced = spliced.copy()\n", + "cp_spliced[['a', 'c']] = data1[['a', 'c']]\n", + "cp_spliced" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Return indexes and cumulative returns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas.io.data as web\n", + "price = web.get_data_yahoo('AAPL', '2011-01-01')['Adj Close']\n", + "price[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "price['2011-10-03'] / price['2011-3-01'] - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "returns = price.pct_change()\n", + "ret_index = (1 + returns).cumprod()\n", + "ret_index[0] = 1 # Set first value to 1\n", + "ret_index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "m_returns = ret_index.resample('BM', how='last').pct_change()\n", + "m_returns['2012']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "m_rets = (1 + returns).resample('M', how='prod', kind='period') - 1\n", + "m_rets['2012']" + ] + }, + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "returns[dividend_dates] += dividend_pcts" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Group transforms and analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = 100\n", + "pd.options.display.max_columns = 10\n", + "np.random.seed(12345)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import random; random.seed(0)\n", + "import string\n", + "\n", + "N = 1000\n", + "def rands(n):\n", + " choices = string.ascii_uppercase\n", + " return ''.join([random.choice(choices) for _ in xrange(n)])\n", + "tickers = np.array([rands(5) for _ in xrange(N)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "M = 500\n", + "df = DataFrame({'Momentum' : np.random.randn(M) / 200 + 0.03,\n", + " 'Value' : np.random.randn(M) / 200 + 0.08,\n", + " 'ShortInterest' : np.random.randn(M) / 200 - 0.02},\n", + " index=tickers[:M])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ind_names 
= np.array(['FINANCIAL', 'TECH'])\n", + "sampler = np.random.randint(0, len(ind_names), N)\n", + "industries = Series(ind_names[sampler], index=tickers,\n", + " name='industry')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_industry = df.groupby(industries)\n", + "by_industry.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_industry.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Within-Industry Standardize\n", + "def zscore(group):\n", + " return (group - group.mean()) / group.std()\n", + "\n", + "df_stand = by_industry.apply(zscore)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "df_stand.groupby(industries).agg(['mean', 'std'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Within-industry rank descending\n", + "ind_rank = by_industry.rank(ascending=False)\n", + "ind_rank.groupby(industries).agg(['min', 'max'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Industry rank and standardize\n", + "by_industry.apply(lambda x: zscore(x.rank()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Group factor exposures" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from numpy.random import rand\n", + "fac1, fac2, fac3 = np.random.rand(3, 1000)\n", + "\n", + "ticker_subset = tickers.take(np.random.permutation(N)[:1000])\n", + "\n", + "# Weighted sum of factors plus noise\n", + 
"port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + rand(1000),\n", + " index=ticker_subset)\n", + "factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},\n", + " index=ticker_subset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "factors.corrwith(port)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.ols(y=port, x=factors).beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def beta_exposure(chunk, factors=None):\n", + " return pd.ols(y=chunk, x=factors).beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "by_ind = port.groupby(industries)\n", + "exposures = by_ind.apply(beta_exposure, factors=factors)\n", + "exposures.unstack()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Decile and quartile analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas.io.data as web\n", + "data = web.get_data_yahoo('SPY', '2006-01-01')\n", + "data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "px = data['Adj Close']\n", + "returns = px.pct_change()\n", + "\n", + "def to_index(rets):\n", + " index = (1 + rets).cumprod()\n", + " first_loc = max(index.index.get_loc(index.idxmax()) - 1, 0)\n", + " index.values[first_loc] = 1\n", + " return index\n", + "\n", + "def trend_signal(rets, lookback, lag):\n", + " signal = pd.rolling_sum(rets, lookback, min_periods=lookback - 5)\n", + " return signal.shift(lag)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "collapsed": false + }, + "outputs": [], + "source": [ + "signal = trend_signal(returns, 100, 3)\n", + "trade_friday = signal.resample('W-FRI').resample('B', fill_method='ffill')\n", + "trade_rets = trade_friday.shift(1) * returns\n", + "trade_rets = trade_rets[:len(returns)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "to_index(trade_rets).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vol = pd.rolling_std(returns, 250, min_periods=200) * np.sqrt(250)\n", + "\n", + "def sharpe(rets, ann=250):\n", + " return rets.mean() / rets.std() * np.sqrt(ann)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "cats = pd.qcut(vol, 4)\n", + "print('cats: %d, trade_rets: %d, vol: %d' % (len(cats), len(trade_rets), len(vol)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "trade_rets.groupby(cats).agg(sharpe)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More example applications" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Signal frontier analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names = ['AAPL', 'GOOG', 'MSFT', 'DELL', 'GS', 'MS', 'BAC', 'C']\n", + "def get_px(stock, start, end):\n", + " return web.get_data_yahoo(stock, start, end)['Adj Close']\n", + "px = DataFrame({n: get_px(n, None, None) for n in names})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#px = pd.read_csv('ch11/stock_px.csv')" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "px = px.asfreq('B').fillna(method='pad')\n", + "rets = px.pct_change()\n", + "((1 + rets).cumprod() - 1).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def calc_mom(price, lookback, lag):\n", + " mom_ret = price.shift(lag).pct_change(lookback)\n", + " ranks = mom_ret.rank(axis=1, ascending=False)\n", + " demeaned = ranks.subtract(ranks.mean(axis=1), axis=0)\n", + " return demeaned.divide(demeaned.std(axis=1), axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "compound = lambda x : (1 + x).prod() - 1\n", + "daily_sr = lambda x: x.mean() / x.std()\n", + "\n", + "def strat_sr(prices, lb, hold):\n", + " # Compute portfolio weights\n", + " freq = '%dB' % hold\n", + " port = calc_mom(prices, lb, lag=1)\n", + "\n", + " daily_rets = prices.pct_change()\n", + "\n", + " # Compute portfolio returns\n", + " port = port.shift(1).resample(freq, how='first')\n", + " returns = daily_rets.resample(freq, how=compound)\n", + " port_rets = (port * returns).sum(axis=1)\n", + "\n", + " return daily_sr(port_rets) * np.sqrt(252 / hold)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "strat_sr(px, 70, 30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "lookbacks = range(20, 90, 5)\n", + "holdings = range(20, 90, 5)\n", + "dd = defaultdict(dict)\n", + "for lb in lookbacks:\n", + " for hold in holdings:\n", + " 
dd[lb][hold] = strat_sr(px, lb, hold)\n", + "\n", + "ddf = DataFrame(dd)\n", + "ddf.index.name = 'Holding Period'\n", + "ddf.columns.name = 'Lookback Period'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "def heatmap(df, cmap=plt.cm.gray_r):\n", + " fig = plt.figure()\n", + " ax = fig.add_subplot(111)\n", + " axim = ax.imshow(df.values, cmap=cmap, interpolation='nearest')\n", + " ax.set_xlabel(df.columns.name)\n", + " ax.set_xticks(np.arange(len(df.columns)))\n", + " ax.set_xticklabels(list(df.columns))\n", + " ax.set_ylabel(df.index.name)\n", + " ax.set_yticks(np.arange(len(df.index)))\n", + " ax.set_yticklabels(list(df.index))\n", + " plt.colorbar(axim)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "heatmap(ddf)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Future contract rolling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas.io.data as web\n", + "# Approximate price of S&P 500 index\n", + "px = web.get_data_yahoo('SPY')['Adj Close'] * 10\n", + "px" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "expiry = {'ESU2': datetime(2012, 9, 21),\n", + " 'ESZ2': datetime(2012, 12, 21)}\n", + "expiry = Series(expiry).order()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "expiry" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12347)\n", + "N = 200\n", + "walk = (np.random.randint(0, 200, size=N) - 100) * 0.25\n", + "perturb = (np.random.randint(0, 20, size=N) - 10) * 0.25\n", + "walk = walk.cumsum()\n", + "\n", + "rng = pd.date_range(px.index[0], periods=len(px) + N, freq='B')\n", + "near = np.concatenate([px.values, px.values[-1] + walk])\n", + "far = np.concatenate([px.values, px.values[-1] + walk + perturb])\n", + "prices = DataFrame({'ESU2': near, 'ESZ2': far}, index=rng)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "prices.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def get_roll_weights(start, expiry, items, roll_periods=5):\n", + " # start : first date to compute weighting DataFrame\n", + " # expiry : Series of ticker -> expiration dates\n", + " # items : sequence of contract names\n", + "\n", + " dates = pd.date_range(start, expiry[-1], freq='B')\n", + " weights = DataFrame(np.zeros((len(dates), len(items))),\n", + " index=dates, columns=items)\n", + "\n", + " prev_date = weights.index[0]\n", + " for i, (item, ex_date) in enumerate(expiry.iteritems()):\n", + " if i < len(expiry) - 1:\n", + " weights.ix[prev_date:ex_date - pd.offsets.BDay(), item] = 1\n", + " roll_rng = pd.date_range(end=ex_date - pd.offsets.BDay(),\n", + " periods=roll_periods + 1, freq='B')\n", + "\n", + " decay_weights = np.linspace(0, 1, roll_periods + 1)\n", + " weights.ix[roll_rng, item] = 1 - decay_weights\n", + " weights.ix[roll_rng, expiry.index[i + 1]] = decay_weights\n", + " else:\n", + " weights.ix[prev_date:, item] = 1\n", + "\n", + " prev_date = ex_date\n", + "\n", + " return weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "weights = get_roll_weights('6/1/2012', expiry, prices.columns)\n", + "weights.ix['2012-09-12':'2012-09-21']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "rolled_returns = (prices.pct_change() * weights).sum(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Rolling correlation and linear regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "aapl = web.get_data_yahoo('AAPL', '2000-01-01')['Adj Close']\n", + "msft = web.get_data_yahoo('MSFT', '2000-01-01')['Adj Close']\n", + "\n", + "aapl_rets = aapl.pct_change()\n", + "msft_rets = msft.pct_change()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "pd.rolling_corr(aapl_rets, msft_rets, 250).plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = pd.ols(y=aapl_rets, x={'MSFT': msft_rets}, window=250)\n", + "model.beta" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model.beta['MSFT'].plot()" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:a9eebd34b149c3c030b4256d857ef5171f5a0b76224da80d6620b3050ae86364" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Financial and 
Economic Data Applications" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from pandas import Series, DataFrame\n", - "import pandas as pd\n", - "from numpy.random import randn\n", - "import numpy as np\n", - "pd.options.display.max_rows = 12\n", - "np.set_printoptions(precision=4, suppress=True)\n", - "import matplotlib.pyplot as plt\n", - "plt.rc('figure', figsize=(12, 6))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%matplotlib inline" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pwd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%cd ../book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data munging topics" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Time series and cross-section alignment" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px = pd.read_csv('ch11/stock_px.csv', parse_dates=True, index_col=0)\n", - "volume = pd.read_csv('ch11/volume.csv', parse_dates=True, index_col=0)\n", - "prices = close_px.ix['2011-09-05':'2011-09-14', ['AAPL', 'JNJ', 'SPX', 'XOM']]\n", - "volume = volume.ix['2011-09-05':'2011-09-12', ['AAPL', 'JNJ', 'XOM']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "prices" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "volume" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"prices * volume" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "vwap = (prices * volume).sum() / volume.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "vwap" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "vwap.dropna()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "prices.align(volume, join='inner')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = Series(range(3), index=['a', 'b', 'c'])\n", - "s2 = Series(range(4), index=['d', 'b', 'c', 'e'])\n", - "s3 = Series(range(3), index=['f', 'a', 'c'])\n", - "DataFrame({'one': s1, 'two': s2, 'three': s3})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "DataFrame({'one': s1, 'two': s2, 'three': s3}, index=list('face'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Operations with time series of different frequencies" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts1 = Series(np.random.randn(3),\n", - " index=pd.date_range('2012-6-13', periods=3, freq='W-WED'))\n", - "ts1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts1.resample('B')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts1.resample('B', fill_method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "dates = pd.DatetimeIndex(['2012-6-12', '2012-6-17', '2012-6-18',\n", - " '2012-6-21', '2012-6-22', '2012-6-29'])\n", - "ts2 = Series(np.random.randn(6), index=dates)\n", - "ts2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts1.reindex(ts2.index, method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts2 + ts1.reindex(ts2.index, method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Using periods instead of timestamps" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "gdp = Series([1.78, 1.94, 2.08, 2.01, 2.15, 2.31, 2.46],\n", - " index=pd.period_range('1984Q2', periods=7, freq='Q-SEP'))\n", - "infl = Series([0.025, 0.045, 0.037, 0.04],\n", - " index=pd.period_range('1982', periods=4, freq='A-DEC'))\n", - "gdp" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "infl" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "infl_q = infl.asfreq('Q-SEP', how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "infl_q" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "infl_q.reindex(gdp.index, method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Time of day and \"as of\" data selection" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Make an intraday 
date range and time series\n", - "rng = pd.date_range('2012-06-01 09:30', '2012-06-01 15:59', freq='T')\n", - "# Make a 5-day series of 9:30-15:59 values\n", - "rng = rng.append([rng + pd.offsets.BDay(i) for i in range(1, 4)])\n", - "ts = Series(np.arange(len(rng), dtype=float), index=rng)\n", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import time\n", - "ts[time(10, 0)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.at_time(time(10, 0))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.between_time(time(10, 0), time(10, 1))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12346)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Set most of the time series randomly to NA\n", - "indexer = np.sort(np.random.permutation(len(ts))[700:])\n", - "irr_ts = ts.copy()\n", - "irr_ts[indexer] = np.nan\n", - "irr_ts['2012-06-01 09:50':'2012-06-01 10:00']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "selection = pd.date_range('2012-06-01 10:00', periods=4, freq='B')\n", - "irr_ts.asof(selection)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Splicing together data sources" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data1 = DataFrame(np.ones((6, 3), dtype=float),\n", - " columns=['a', 'b', 'c'],\n", - " index=pd.date_range('6/12/2012', periods=6))\n", - "data2 = DataFrame(np.ones((6, 3), dtype=float) 
* 2,\n", - " columns=['a', 'b', 'c'],\n", - " index=pd.date_range('6/13/2012', periods=6))\n", - "spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])\n", - "spliced" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data2 = DataFrame(np.ones((6, 4), dtype=float) * 2,\n", - " columns=['a', 'b', 'c', 'd'],\n", - " index=pd.date_range('6/13/2012', periods=6))\n", - "spliced = pd.concat([data1.ix[:'2012-06-14'], data2.ix['2012-06-15':]])\n", - "spliced" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "spliced_filled = spliced.combine_first(data2)\n", - "spliced_filled" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "spliced.update(data2, overwrite=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "spliced" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cp_spliced = spliced.copy()\n", - "cp_spliced[['a', 'c']] = data1[['a', 'c']]\n", - "cp_spliced" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Return indexes and cumulative returns" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas.io.data as web\n", - "price = web.get_data_yahoo('AAPL', '2011-01-01')['Adj Close']\n", - "price[-5:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "price['2011-10-03'] / price['2011-3-01'] - 1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"returns = price.pct_change()\n", - "ret_index = (1 + returns).cumprod()\n", - "ret_index[0] = 1 # Set first value to 1\n", - "ret_index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m_returns = ret_index.resample('BM', how='last').pct_change()\n", - "m_returns['2012']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m_rets = (1 + returns).resample('M', how='prod', kind='period') - 1\n", - "m_rets['2012']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "returns[dividend_dates] += dividend_pcts" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Group transforms and analysis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = 100\n", - "pd.options.display.max_columns = 10\n", - "np.random.seed(12345)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import random; random.seed(0)\n", - "import string\n", - "\n", - "N = 1000\n", - "def rands(n):\n", - " choices = string.ascii_uppercase\n", - " return ''.join([random.choice(choices) for _ in xrange(n)])\n", - "tickers = np.array([rands(5) for _ in xrange(N)])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "M = 500\n", - "df = DataFrame({'Momentum' : np.random.randn(M) / 200 + 0.03,\n", - " 'Value' : np.random.randn(M) / 200 + 0.08,\n", - " 'ShortInterest' : np.random.randn(M) / 200 - 0.02},\n", - " index=tickers[:M])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ind_names = np.array(['FINANCIAL', 'TECH'])\n", - "sampler = 
np.random.randint(0, len(ind_names), N)\n", - "industries = Series(ind_names[sampler], index=tickers,\n", - " name='industry')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_industry = df.groupby(industries)\n", - "by_industry.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_industry.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Within-Industry Standardize\n", - "def zscore(group):\n", - " return (group - group.mean()) / group.std()\n", - "\n", - "df_stand = by_industry.apply(zscore)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_stand.groupby(industries).agg(['mean', 'std'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Within-industry rank descending\n", - "ind_rank = by_industry.rank(ascending=False)\n", - "ind_rank.groupby(industries).agg(['min', 'max'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Industry rank and standardize\n", - "by_industry.apply(lambda x: zscore(x.rank()))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Group factor exposures" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.random import rand\n", - "fac1, fac2, fac3 = np.random.rand(3, 1000)\n", - "\n", - "ticker_subset = tickers.take(np.random.permutation(N)[:1000])\n", - "\n", - "# Weighted sum of factors plus noise\n", - "port = Series(0.7 * fac1 - 1.2 * fac2 + 0.3 * fac3 + rand(1000),\n", - " 
index=ticker_subset)\n", - "factors = DataFrame({'f1': fac1, 'f2': fac2, 'f3': fac3},\n", - " index=ticker_subset)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "factors.corrwith(port)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.ols(y=port, x=factors).beta" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def beta_exposure(chunk, factors=None):\n", - " return pd.ols(y=chunk, x=factors).beta" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_ind = port.groupby(industries)\n", - "exposures = by_ind.apply(beta_exposure, factors=factors)\n", - "exposures.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Decile and quartile analysis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas.io.data as web\n", - "data = web.get_data_yahoo('SPY', '2006-01-01')\n", - "data.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "px = data['Adj Close']\n", - "returns = px.pct_change()\n", - "\n", - "def to_index(rets):\n", - " index = (1 + rets).cumprod()\n", - " first_loc = max(index.index.get_loc(index.idxmax()) - 1, 0)\n", - " index.values[first_loc] = 1\n", - " return index\n", - "\n", - "def trend_signal(rets, lookback, lag):\n", - " signal = pd.rolling_sum(rets, lookback, min_periods=lookback - 5)\n", - " return signal.shift(lag)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "signal = trend_signal(returns, 100, 3)\n", - 
"trade_friday = signal.resample('W-FRI').resample('B', fill_method='ffill')\n", - "trade_rets = trade_friday.shift(1) * returns\n", - "trade_rets = trade_rets[:len(returns)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "to_index(trade_rets).plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "vol = pd.rolling_std(returns, 250, min_periods=200) * np.sqrt(250)\n", - "\n", - "def sharpe(rets, ann=250):\n", - " return rets.mean() / rets.std() * np.sqrt(ann)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cats = pd.qcut(vol, 4)\n", - "print('cats: %d, trade_rets: %d, vol: %d' % (len(cats), len(trade_rets), len(vol)))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "trade_rets.groupby(cats).agg(sharpe)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "More example applications" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Signal frontier analysis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = ['AAPL', 'GOOG', 'MSFT', 'DELL', 'GS', 'MS', 'BAC', 'C']\n", - "def get_px(stock, start, end):\n", - " return web.get_data_yahoo(stock, start, end)['Adj Close']\n", - "px = DataFrame({n: get_px(n, None, None) for n in names})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "px = pd.read_pickle('notebooks/stock_prices')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "px = px.asfreq('B').fillna(method='pad')\n", - "rets = px.pct_change()\n", - "((1 + rets).cumprod() - 1).plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def calc_mom(price, lookback, lag):\n", - " mom_ret = price.shift(lag).pct_change(lookback)\n", - " ranks = mom_ret.rank(axis=1, ascending=False)\n", - " demeaned = ranks.subtract(ranks.mean(axis=1), axis=0)\n", - " return demeaned.divide(demeaned.std(axis=1), axis=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "compound = lambda x : (1 + x).prod() - 1\n", - "daily_sr = lambda x: x.mean() / x.std()\n", - "\n", - "def strat_sr(prices, lb, hold):\n", - " # Compute portfolio weights\n", - " freq = '%dB' % hold\n", - " port = calc_mom(prices, lb, lag=1)\n", - "\n", - " daily_rets = prices.pct_change()\n", - "\n", - " # Compute portfolio returns\n", - " port = port.shift(1).resample(freq, how='first')\n", - " returns = daily_rets.resample(freq, how=compound)\n", - " port_rets = (port * returns).sum(axis=1)\n", - "\n", - " return daily_sr(port_rets) * np.sqrt(252 / hold)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "strat_sr(px, 70, 30)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from collections import defaultdict\n", - "\n", - "lookbacks = range(20, 90, 5)\n", - "holdings = range(20, 90, 5)\n", - "dd = defaultdict(dict)\n", - "for lb in lookbacks:\n", - " for hold in holdings:\n", - " dd[lb][hold] = strat_sr(px, lb, hold)\n", - "\n", - "ddf = DataFrame(dd)\n", - "ddf.index.name = 'Holding Period'\n", - "ddf.columns.name = 'Lookback Period'" - 
], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt\n", - "\n", - "def heatmap(df, cmap=plt.cm.gray_r):\n", - " fig = plt.figure()\n", - " ax = fig.add_subplot(111)\n", - " axim = ax.imshow(df.values, cmap=cmap, interpolation='nearest')\n", - " ax.set_xlabel(df.columns.name)\n", - " ax.set_xticks(np.arange(len(df.columns)))\n", - " ax.set_xticklabels(list(df.columns))\n", - " ax.set_ylabel(df.index.name)\n", - " ax.set_yticks(np.arange(len(df.index)))\n", - " ax.set_yticklabels(list(df.index))\n", - " plt.colorbar(axim)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "heatmap(ddf)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Future contract rolling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = 10" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas.io.data as web\n", - "# Approximate price of S&P 500 index\n", - "px = web.get_data_yahoo('SPY')['Adj Close'] * 10\n", - "px" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime\n", - "expiry = {'ESU2': datetime(2012, 9, 21),\n", - " 'ESZ2': datetime(2012, 12, 21)}\n", - "expiry = Series(expiry).order()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "expiry" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12347)\n", - "N = 200\n", - "walk = (np.random.randint(0, 200, size=N) - 100) * 
0.25\n", - "perturb = (np.random.randint(0, 20, size=N) - 10) * 0.25\n", - "walk = walk.cumsum()\n", - "\n", - "rng = pd.date_range(px.index[0], periods=len(px) + N, freq='B')\n", - "near = np.concatenate([px.values, px.values[-1] + walk])\n", - "far = np.concatenate([px.values, px.values[-1] + walk + perturb])\n", - "prices = DataFrame({'ESU2': near, 'ESZ2': far}, index=rng)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "prices.tail()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_roll_weights(start, expiry, items, roll_periods=5):\n", - " # start : first date to compute weighting DataFrame\n", - " # expiry : Series of ticker -> expiration dates\n", - " # items : sequence of contract names\n", - "\n", - " dates = pd.date_range(start, expiry[-1], freq='B')\n", - " weights = DataFrame(np.zeros((len(dates), len(items))),\n", - " index=dates, columns=items)\n", - "\n", - " prev_date = weights.index[0]\n", - " for i, (item, ex_date) in enumerate(expiry.iteritems()):\n", - " if i < len(expiry) - 1:\n", - " weights.ix[prev_date:ex_date - pd.offsets.BDay(), item] = 1\n", - " roll_rng = pd.date_range(end=ex_date - pd.offsets.BDay(),\n", - " periods=roll_periods + 1, freq='B')\n", - "\n", - " decay_weights = np.linspace(0, 1, roll_periods + 1)\n", - " weights.ix[roll_rng, item] = 1 - decay_weights\n", - " weights.ix[roll_rng, expiry.index[i + 1]] = decay_weights\n", - " else:\n", - " weights.ix[prev_date:, item] = 1\n", - "\n", - " prev_date = ex_date\n", - "\n", - " return weights" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "weights = get_roll_weights('6/1/2012', expiry, prices.columns)\n", - "weights.ix['2012-09-12':'2012-09-21']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "rolled_returns = (prices.pct_change() * weights).sum(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Rolling correlation and linear regression" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "aapl = web.get_data_yahoo('AAPL', '2000-01-01')['Adj Close']\n", - "msft = web.get_data_yahoo('MSFT', '2000-01-01')['Adj Close']\n", - "\n", - "aapl_rets = aapl.pct_change()\n", - "msft_rets = msft.pct_change()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.rolling_corr(aapl_rets, msft_rets, 250).plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "model = pd.ols(y=aapl_rets, x={'MSFT': msft_rets}, window=250)\n", - "model.beta" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "model.beta['MSFT'].plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.10" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 
0918478003a741970045ee5c5bf87eb4e50b77bd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 6 Nov 2015 14:25:08 -0800 Subject: [PATCH 9/9] Fixes for ch12 --- ch12.ipynb | 3516 ++++++++++++++++++++-------------------------------- 1 file changed, 1330 insertions(+), 2186 deletions(-) diff --git a/ch12.ipynb b/ch12.ipynb index c88f90d17..a9f5f55be 100644 --- a/ch12.ipynb +++ b/ch12.ipynb @@ -1,2189 +1,1333 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Advanced NumPy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from __future__ import division\n", + "from numpy.random import randn\n", + "from pandas import Series\n", + "import numpy as np\n", + "np.set_printoptions(precision=4)\n", + "import sys" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ndarray object internals" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### NumPy dtype hierarchy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ints = np.ones(10, dtype=np.uint16)\n", + "floats = np.ones(10, dtype=np.float32)\n", + "np.issubdtype(ints.dtype, np.integer)\n", + "np.issubdtype(floats.dtype, np.floating)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.float64.mro()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced array manipulation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reshaping arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(8)\n", + "arr\n", + "arr.reshape((4, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "arr.reshape((4, 2)).reshape((2, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(15)\n", + "arr.reshape((5, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "other_arr = np.ones((3, 5))\n", + "other_arr.shape\n", + "arr.reshape(other_arr.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(15).reshape((5, 3))\n", + "arr\n", + "arr.ravel()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr.flatten()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### C vs. Fortran order" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(12).reshape((3, 4))\n", + "arr\n", + "arr.ravel()\n", + "arr.ravel('F')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Concatenating and splitting arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n", + "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n", + "np.concatenate([arr1, arr2], axis=0)\n", + "np.concatenate([arr1, arr2], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.vstack((arr1, arr2))\n", + "np.hstack((arr1, arr2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from numpy.random import randn\n", 
+ "arr = randn(5, 2)\n", + "arr\n", + "first, second, third = np.split(arr, [1, 3])\n", + "first\n", + "second\n", + "third" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Stacking helpers: " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(6)\n", + "arr1 = arr.reshape((3, 2))\n", + "arr2 = randn(3, 2)\n", + "np.r_[arr1, arr2]\n", + "np.c_[np.r_[arr1, arr2], arr]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.c_[1:6, -10:-5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Repeating elements: tile and repeat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(3)\n", + "arr.repeat(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr.repeat([2, 3, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(2, 2)\n", + "arr\n", + "arr.repeat(2, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr.repeat([2, 3], axis=0)\n", + "arr.repeat([2, 3], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr\n", + "np.tile(arr, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr\n", + "np.tile(arr, (2, 1))\n", + "np.tile(arr, (3, 2))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Fancy indexing equivalents: 
take and put" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(10) * 100\n", + "inds = [7, 1, 2, 6]\n", + "arr[inds]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr.take(inds)\n", + "arr.put(inds, 42)\n", + "arr\n", + "arr.put(inds, [40, 41, 42, 43])\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "inds = [2, 0, 2, 1]\n", + "arr = randn(2, 4)\n", + "arr\n", + "arr.take(inds, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Broadcasting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(5)\n", + "arr\n", + "arr * 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(4, 3)\n", + "arr.mean(0)\n", + "demeaned = arr - arr.mean(0)\n", + "demeaned\n", + "demeaned.mean(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr\n", + "row_means = arr.mean(1)\n", + "row_means.reshape((4, 1))\n", + "demeaned = arr - row_means.reshape((4, 1))\n", + "demeaned.mean(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Broadcasting over other axes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr - arr.mean(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr - arr.mean(1).reshape((4, 1))" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.zeros((4, 4))\n", + "arr_3d = arr[:, np.newaxis, :]\n", + "arr_3d.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr_1d = np.random.normal(size=3)\n", + "arr_1d[:, np.newaxis]\n", + "arr_1d[np.newaxis, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(3, 4, 5)\n", + "depth_means = arr.mean(2)\n", + "depth_means\n", + "demeaned = arr - depth_means[:, :, np.newaxis]\n", + "demeaned.mean(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def demean_axis(arr, axis=0):\n", + " means = arr.mean(axis)\n", + "\n", + " # This generalized things like [:, :, np.newaxis] to N dimensions\n", + " indexer = [slice(None)] * arr.ndim\n", + " indexer[axis] = np.newaxis\n", + " return arr - means[indexer]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setting array values by broadcasting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.zeros((4, 3))\n", + "arr[:] = 5\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "col = np.array([1.28, -0.42, 0.44, 1.6])\n", + "arr[:] = col[:, np.newaxis]\n", + "arr\n", + "arr[:2] = [[-1.37], [0.509]]\n", + "arr" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced ufunc usage" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Ufunc instance methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + 
"outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.add.reduce(arr)\n", + "arr.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.random.seed(12346)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(5, 5)\n", + "arr[::2].sort(1) # sort a few rows\n", + "arr[:, :-1] < arr[:, 1:]\n", + "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(15).reshape((3, 5))\n", + "np.add.accumulate(arr, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(3).repeat([1, 2, 2])\n", + "arr\n", + "np.multiply.outer(arr, np.arange(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "result = np.subtract.outer(randn(3, 4), randn(5))\n", + "result.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.add.reduceat(arr, [0, 5, 8])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.multiply.outer(np.arange(4), np.arange(5))\n", + "arr\n", + "np.add.reduceat(arr, [0, 2, 4], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom ufuncs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def add_elements(x, y):\n", + " return x + y\n", + "add_them = np.frompyfunc(add_elements, 2, 1)\n", + 
"add_them(np.arange(8), np.arange(8))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "add_them = np.vectorize(add_elements, otypes=[np.float64])\n", + "add_them(np.arange(8), np.arange(8))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(10000)\n", + "%timeit add_them(arr, arr)\n", + "%timeit np.add(arr, arr)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Structured and record arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dtype = [('x', np.float64), ('y', np.int32)]\n", + "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n", + "sarr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sarr[0]\n", + "sarr[0]['y']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "sarr['x']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Nested dtypes and multidimensional fields" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dtype = [('x', np.int64, 3), ('y', np.int32)]\n", + "arr = np.zeros(4, dtype=dtype)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[0]['x']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr['x']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dtype = 
[('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]\n", + "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n", + "data['x']\n", + "data['y']\n", + "data['x']['a']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Why use structured arrays?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Structured array manipulations: numpy.lib.recfunctions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## More about sorting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(6)\n", + "arr.sort()\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(3, 5)\n", + "arr\n", + "arr[:, 0].sort() # Sort first column values in-place\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(5)\n", + "arr\n", + "np.sort(arr)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = randn(3, 5)\n", + "arr\n", + "arr.sort(axis=1)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr[:, ::-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Indirect sorts: argsort and lexsort" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "values = np.array([5, 0, 1, 3, 2])\n", + "indexer = values.argsort()\n", + "indexer\n", + "values[indexer]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = 
randn(3, 5)\n", + "arr[0] = values\n", + "arr\n", + "arr[:, arr[0].argsort()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n", + "last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n", + "sorter = np.lexsort((first_name, last_name))\n", + "zip(last_name[sorter], first_name[sorter])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Alternate sort algorithms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])\n", + "key = np.array([2, 2, 1, 1, 1])\n", + "indexer = key.argsort(kind='mergesort')\n", + "indexer\n", + "values.take(indexer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### numpy.searchsorted: Finding elements in a sorted array" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.array([0, 1, 7, 12, 15])\n", + "arr.searchsorted(9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr.searchsorted([0, 8, 11, 16])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr = np.array([0, 0, 0, 1, 1, 1, 1])\n", + "arr.searchsorted([0, 1])\n", + "arr.searchsorted([0, 1], side='right')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "data = np.floor(np.random.uniform(0, 10000, size=50))\n", + "bins = np.array([0, 100, 1000, 5000, 10000])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "labels = bins.searchsorted(data)\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Series(data).groupby(labels).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.digitize(data, bins)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## NumPy matrix class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "X = np.array([[ 8.82768214, 3.82222409, -1.14276475, 2.04411587],\n", + " [ 3.82222409, 6.75272284, 0.83909108, 2.08293758],\n", + " [-1.14276475, 0.83909108, 5.01690521, 0.79573241],\n", + " [ 2.04411587, 2.08293758, 0.79573241, 6.24095859]])\n", + "X[:, 0] # one-dimensional\n", + "y = X[:, :1] # two-dimensional by slicing\n", + "X\n", + "y" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "np.dot(y.T, np.dot(X, y))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Xm = np.matrix(X)\n", + "ym = Xm[:, 0]\n", + "Xm\n", + "ym\n", + "ym.T * Xm * ym" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "Xm.I * X" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced array input and output" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Memory-mapped files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mmap = np.memmap('mymmap', dtype='float64', mode='w+', 
shape=(10000, 10000))\n", + "mmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "section = mmap[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "section[:] = np.random.randn(5, 10000)\n", + "mmap.flush()\n", + "mmap\n", + "del mmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n", + "mmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%xdel mmap\n", + "!rm mymmap" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HDF5 and other array storage options" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Performance tips" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The importance of contiguous memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr_c = np.ones((1000, 1000), order='C')\n", + "arr_f = np.ones((1000, 1000), order='F')\n", + "arr_c.flags\n", + "arr_f.flags\n", + "arr_f.flags.f_contiguous" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%timeit arr_c.sum(1)\n", + "%timeit arr_f.sum(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr_f.copy('C').flags" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "arr_c[:50].flags.contiguous\n", + "arr_c[:, :50].flags" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%xdel arr_c\n", + "%xdel arr_f\n", + "%cd .." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Other speed options: Cython, f2py, C" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "```cython\n", + "from numpy cimport ndarray, float64_t\n", + "\n", + "def sum_elements(ndarray[float64_t] arr):\n", + " cdef Py_ssize_t i, n = len(arr)\n", + " cdef float64_t result = 0\n", + "\n", + " for i in range(n):\n", + " result += arr[i]\n", + "\n", + " return result\n", + "```" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:ee839a37538f481abf14fdebcd8219c687df840e35bf9d9437399180d9b23a7f" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Advanced NumPy" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from __future__ import division\n", - "from numpy.random import randn\n", - "from pandas import Series\n", - "import numpy as np\n", - "np.set_printoptions(precision=4)\n", - "import sys; sys.path.append('book_scripts')\n", - "%cd book_scripts" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "/home/phillip/Documents/code/py/pandas-book/rev_539000/book_scripts\n" - ] - } - ], - "prompt_number": 1 - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "ndarray object internals" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "NumPy dtype hierarchy" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ints = np.ones(10, dtype=np.uint16)\n", - "floats = np.ones(10, dtype=np.float32)\n", - "np.issubdtype(ints.dtype, np.integer)\n", - "np.issubdtype(floats.dtype, np.floating)" - ], - "language": 
"python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 2, - "text": [ - "True" - ] - } - ], - "prompt_number": 2 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.float64.mro()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 3, - "text": [ - "[numpy.float64,\n", - " numpy.floating,\n", - " numpy.inexact,\n", - " numpy.number,\n", - " numpy.generic,\n", - " float,\n", - " object]" - ] - } - ], - "prompt_number": 3 - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced array manipulation" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reshaping arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(8)\n", - "arr\n", - "arr.reshape((4, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 4, - "text": [ - "array([[0, 1],\n", - " [2, 3],\n", - " [4, 5],\n", - " [6, 7]])" - ] - } - ], - "prompt_number": 4 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.reshape((4, 2)).reshape((2, 4))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 5, - "text": [ - "array([[0, 1, 2, 3],\n", - " [4, 5, 6, 7]])" - ] - } - ], - "prompt_number": 5 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15)\n", - "arr.reshape((5, -1))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 6, - "text": [ - "array([[ 0, 1, 2],\n", - " [ 3, 4, 5],\n", - " [ 6, 7, 8],\n", - " [ 9, 10, 11],\n", - " [12, 13, 14]])" - ] - } - ], - "prompt_number": 6 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "other_arr = 
np.ones((3, 5))\n", - "other_arr.shape\n", - "arr.reshape(other_arr.shape)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 7, - "text": [ - "array([[ 0, 1, 2, 3, 4],\n", - " [ 5, 6, 7, 8, 9],\n", - " [10, 11, 12, 13, 14]])" - ] - } - ], - "prompt_number": 7 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15).reshape((5, 3))\n", - "arr\n", - "arr.ravel()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 8, - "text": [ - "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])" - ] - } - ], - "prompt_number": 8 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.flatten()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 9, - "text": [ - "array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])" - ] - } - ], - "prompt_number": 9 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "C vs. 
Fortran order" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(12).reshape((3, 4))\n", - "arr\n", - "arr.ravel()\n", - "arr.ravel('F')" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 10, - "text": [ - "array([ 0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11])" - ] - } - ], - "prompt_number": 10 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Concatenating and splitting arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n", - "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n", - "np.concatenate([arr1, arr2], axis=0)\n", - "np.concatenate([arr1, arr2], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 11, - "text": [ - "array([[ 1, 2, 3, 7, 8, 9],\n", - " [ 4, 5, 6, 10, 11, 12]])" - ] - } - ], - "prompt_number": 11 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.vstack((arr1, arr2))\n", - "np.hstack((arr1, arr2))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 12, - "text": [ - "array([[ 1, 2, 3, 7, 8, 9],\n", - " [ 4, 5, 6, 10, 11, 12]])" - ] - } - ], - "prompt_number": 12 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.random import randn\n", - "arr = randn(5, 2)\n", - "arr\n", - "first, second, third = np.split(arr, [1, 3])\n", - "first\n", - "second\n", - "third" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 13, - "text": [ - "array([[ 0.0865, -0.0964],\n", - " [ 1.7154, 0.3276]])" - ] - } - ], - "prompt_number": 13 - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Stacking helpers: " - ] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "arr = np.arange(6)\n", - "arr1 = arr.reshape((3, 2))\n", - "arr2 = randn(3, 2)\n", - "np.r_[arr1, arr2]\n", - "np.c_[np.r_[arr1, arr2], arr]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 14, - "text": [ - "array([[ 0. , 1. , 0. ],\n", - " [ 2. , 3. , 1. ],\n", - " [ 4. , 5. , 2. ],\n", - " [-0.477 , 1.153 , 3. ],\n", - " [ 0.0919, -0.3852, 4. ],\n", - " [-1.891 , -1.4744, 5. ]])" - ] - } - ], - "prompt_number": 14 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.c_[1:6, -10:-5]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 15, - "text": [ - "array([[ 1, -10],\n", - " [ 2, -9],\n", - " [ 3, -8],\n", - " [ 4, -7],\n", - " [ 5, -6]])" - ] - } - ], - "prompt_number": 15 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Repeating elements: tile and repeat" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(3)\n", - "arr.repeat(3)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 16, - "text": [ - "array([0, 0, 0, 1, 1, 1, 2, 2, 2])" - ] - } - ], - "prompt_number": 16 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.repeat([2, 3, 4])" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 17, - "text": [ - "array([0, 0, 1, 1, 1, 2, 2, 2, 2])" - ] - } - ], - "prompt_number": 17 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(2, 2)\n", - "arr\n", - "arr.repeat(2, axis=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 18, - "text": [ - "array([[ 0.8373, -0.0382],\n", - " [ 
0.8373, -0.0382],\n", - " [-2.3026, -3.1157],\n", - " [-2.3026, -3.1157]])" - ] - } - ], - "prompt_number": 18 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.repeat([2, 3], axis=0)\n", - "arr.repeat([2, 3], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 19, - "text": [ - "array([[ 0.8373, 0.8373, -0.0382, -0.0382, -0.0382],\n", - " [-2.3026, -2.3026, -3.1157, -3.1157, -3.1157]])" - ] - } - ], - "prompt_number": 19 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr\n", - "np.tile(arr, 2)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 20, - "text": [ - "array([[ 0.8373, -0.0382, 0.8373, -0.0382],\n", - " [-2.3026, -3.1157, -2.3026, -3.1157]])" - ] - } - ], - "prompt_number": 20 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr\n", - "np.tile(arr, (2, 1))\n", - "np.tile(arr, (3, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 21, - "text": [ - "array([[ 0.8373, -0.0382, 0.8373, -0.0382],\n", - " [-2.3026, -3.1157, -2.3026, -3.1157],\n", - " [ 0.8373, -0.0382, 0.8373, -0.0382],\n", - " [-2.3026, -3.1157, -2.3026, -3.1157],\n", - " [ 0.8373, -0.0382, 0.8373, -0.0382],\n", - " [-2.3026, -3.1157, -2.3026, -3.1157]])" - ] - } - ], - "prompt_number": 21 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Fancy indexing equivalents: take and put" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10) * 100\n", - "inds = [7, 1, 2, 6]\n", - "arr[inds]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 22, - "text": [ - "array([700, 100, 200, 600])" - ] - } - ], - "prompt_number": 22 - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "arr.take(inds)\n", - "arr.put(inds, 42)\n", - "arr\n", - "arr.put(inds, [40, 41, 42, 43])\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 23, - "text": [ - "array([ 0, 41, 42, 300, 400, 500, 43, 40, 800, 900])" - ] - } - ], - "prompt_number": 23 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "inds = [2, 0, 2, 1]\n", - "arr = randn(2, 4)\n", - "arr\n", - "arr.take(inds, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 24, - "text": [ - "array([[ 0.7526, -0.5752, 0.7526, -0.9173],\n", - " [ 0.5017, 0.8759, 0.5017, -0.4772]])" - ] - } - ], - "prompt_number": 24 - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Broadcasting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(5)\n", - "arr\n", - "arr * 4" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 25, - "text": [ - "array([ 0, 4, 8, 12, 16])" - ] - } - ], - "prompt_number": 25 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(4, 3)\n", - "arr.mean(0)\n", - "demeaned = arr - arr.mean(0)\n", - "demeaned\n", - "demeaned.mean(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 26, - "text": [ - "array([ 0., 0., -0.])" - ] - } - ], - "prompt_number": 26 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr\n", - "row_means = arr.mean(1)\n", - "row_means.reshape((4, 1))\n", - "demeaned = arr - row_means.reshape((4, 1))\n", - "demeaned.mean(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 27, - 
"text": [ - "array([ 0., -0., 0., -0.])" - ] - } - ], - "prompt_number": 27 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Broadcasting over other axes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr - arr.mean(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "ename": "ValueError", - "evalue": "operands could not be broadcast together with shapes (4,3) (4,) ", - "output_type": "pyerr", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m\n\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", - "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0marr\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0marr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[1;31mValueError\u001b[0m: operands could not be broadcast together with shapes (4,3) (4,) " - ] - } - ], - "prompt_number": 28 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr - arr.mean(1).reshape((4, 1))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 29, - "text": [ - "array([[-1.6838, -0.3736, 2.0574],\n", - " [ 1.513 , -0.6511, -0.8619],\n", - " [ 0.4619, 0.3708, -0.8327],\n", - " [ 0.0346, -0.6972, 0.6626]])" - ] - } - ], - "prompt_number": 29 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.zeros((4, 4))\n", - "arr_3d = arr[:, np.newaxis, :]\n", - "arr_3d.shape" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 30, - "text": [ - "(4, 1, 4)" - ] - } - ], - "prompt_number": 30 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_1d = 
np.random.normal(size=3)\n", - "arr_1d[:, np.newaxis]\n", - "arr_1d[np.newaxis, :]" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 31, - "text": [ - "array([[ 0.6034, 0.4693, 0.6303]])" - ] - } - ], - "prompt_number": 31 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(3, 4, 5)\n", - "depth_means = arr.mean(2)\n", - "depth_means\n", - "demeaned = arr - depth_means[:, :, np.newaxis]\n", - "demeaned.mean(2)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 32, - "text": [ - "array([[ 0., 0., -0., 0.],\n", - " [ 0., 0., 0., -0.],\n", - " [ 0., -0., -0., -0.]])" - ] - } - ], - "prompt_number": 32 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def demean_axis(arr, axis=0):\n", - " means = arr.mean(axis)\n", - "\n", - " # This generalized things like [:, :, np.newaxis] to N dimensions\n", - " indexer = [slice(None)] * arr.ndim\n", - " indexer[axis] = np.newaxis\n", - " return arr - means[indexer]" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 34 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Setting array values by broadcasting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.zeros((4, 3))\n", - "arr[:] = 5\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 35, - "text": [ - "array([[ 5., 5., 5.],\n", - " [ 5., 5., 5.],\n", - " [ 5., 5., 5.],\n", - " [ 5., 5., 5.]])" - ] - } - ], - "prompt_number": 35 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "col = np.array([1.28, -0.42, 0.44, 1.6])\n", - "arr[:] = col[:, np.newaxis]\n", - "arr\n", - "arr[:2] = [[-1.37], [0.509]]\n", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": 
[ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 36, - "text": [ - "array([[-1.37 , -1.37 , -1.37 ],\n", - " [ 0.509, 0.509, 0.509],\n", - " [ 0.44 , 0.44 , 0.44 ],\n", - " [ 1.6 , 1.6 , 1.6 ]])" - ] - } - ], - "prompt_number": 36 - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced ufunc usage" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Ufunc instance methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)\n", - "np.add.reduce(arr)\n", - "arr.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 37, - "text": [ - "45" - ] - } - ], - "prompt_number": 37 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12346)" - ], - "language": "python", - "metadata": {}, - "outputs": [], - "prompt_number": 38 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(5, 5)\n", - "arr[::2].sort(1) # sort a few rows\n", - "arr[:, :-1] < arr[:, 1:]\n", - "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 39, - "text": [ - "array([ True, False, True, False, True], dtype=bool)" - ] - } - ], - "prompt_number": 39 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15).reshape((3, 5))\n", - "np.add.accumulate(arr, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 40, - "text": [ - "array([[ 0, 1, 3, 6, 10],\n", - " [ 5, 11, 18, 26, 35],\n", - " [10, 21, 33, 46, 60]])" - ] - } - ], - "prompt_number": 40 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(3).repeat([1, 2, 2])\n", - "arr\n", - "np.multiply.outer(arr, 
np.arange(5))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 41, - "text": [ - "array([[0, 0, 0, 0, 0],\n", - " [0, 1, 2, 3, 4],\n", - " [0, 1, 2, 3, 4],\n", - " [0, 2, 4, 6, 8],\n", - " [0, 2, 4, 6, 8]])" - ] - } - ], - "prompt_number": 41 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = np.subtract.outer(randn(3, 4), randn(5))\n", - "result.shape" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 42, - "text": [ - "(3, 4, 5)" - ] - } - ], - "prompt_number": 42 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)\n", - "np.add.reduceat(arr, [0, 5, 8])" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 43, - "text": [ - "array([10, 18, 17])" - ] - } - ], - "prompt_number": 43 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.multiply.outer(np.arange(4), np.arange(5))\n", - "arr\n", - "np.add.reduceat(arr, [0, 2, 4], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 44, - "text": [ - "array([[ 0, 0, 0],\n", - " [ 1, 5, 4],\n", - " [ 2, 10, 8],\n", - " [ 3, 15, 12]])" - ] - } - ], - "prompt_number": 44 - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Custom ufuncs" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def add_elements(x, y):\n", - " return x + y\n", - "add_them = np.frompyfunc(add_elements, 2, 1)\n", - "add_them(np.arange(8), np.arange(8))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 45, - "text": [ - "array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)" - ] - } - ], - "prompt_number": 45 - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "add_them = np.vectorize(add_elements, otypes=[np.float64])\n", - "add_them(np.arange(8), np.arange(8))" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 46, - "text": [ - "array([ 0., 2., 4., 6., 8., 10., 12., 14.])" - ] - } - ], - "prompt_number": 46 - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = randn(10000)\n", - "%timeit add_them(arr, arr)\n", - "%timeit np.add(arr, arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "1000 loops, best of 3: 1.7 ms per loop\n", - "100000 loops, best of 3: 4.68 \u00b5s per loop" - ] - }, - { - "output_type": "stream", - "stream": "stdout", - "text": [ - "\n" - ] - } - ], - "prompt_number": 47 - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Structured and record arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dtype = [('x', np.float64), ('y', np.int32)]\n", - "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n", - "sarr" - ], - "language": "python", - "metadata": {}, - "outputs": [ - { - "metadata": {}, - "output_type": "pyout", - "prompt_number": 48, - "text": [ - "array([(1.5, 6), (3.141592653589793, -2)], \n", - " dtype=[('x', '