diff --git a/appa.ipynb b/appa.ipynb index 9c600fdfc..33fb5a219 100644 --- a/appa.ipynb +++ b/appa.ipynb @@ -1,1354 +1,1686 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Advanced NumPy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## ndarray Object Internals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.ones((10, 5)).shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.ones((3, 4, 5), dtype=np.float64).strides" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### NumPy dtype Hierarchy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ints = np.ones(10, dtype=np.uint16)\n", + "floats = np.ones(10, dtype=np.float32)\n", + "np.issubdtype(ints.dtype, np.integer)\n", + "np.issubdtype(floats.dtype, np.floating)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + 
"source": [ + "np.float64.mro()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.issubdtype(ints.dtype, np.number)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Advanced Array Manipulation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reshaping Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(8)\n", + "arr\n", + "arr.reshape((4, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.reshape((4, 2)).reshape((2, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(15)\n", + "arr.reshape((5, -1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "other_arr = np.ones((3, 5))\n", + "other_arr.shape\n", + "arr.reshape(other_arr.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(15).reshape((5, 3))\n", + "arr\n", + "arr.ravel()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.flatten()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"deletable": true, + "editable": true + }, + "source": [ + "### C Versus Fortran Order" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(12).reshape((3, 4))\n", + "arr\n", + "arr.ravel()\n", + "arr.ravel('F')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Concatenating and Splitting Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr1 = np.array([[1, 2, 3], [4, 5, 6]])\n", + "arr2 = np.array([[7, 8, 9], [10, 11, 12]])\n", + "np.concatenate([arr1, arr2], axis=0)\n", + "np.concatenate([arr1, arr2], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.vstack((arr1, arr2))\n", + "np.hstack((arr1, arr2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(5, 2)\n", + "arr\n", + "first, second, third = np.split(arr, [1, 3])\n", + "first\n", + "second\n", + "third" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Stacking helpers: r_ and c_" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(6)\n", + "arr1 = arr.reshape((3, 2))\n", + "arr2 = np.random.randn(3, 2)\n", + "np.r_[arr1, arr2]\n", + "np.c_[np.r_[arr1, arr2], arr]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.c_[1:6, -10:-5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Repeating Elements: tile and repeat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(3)\n", + "arr\n", + "arr.repeat(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.repeat([2, 3, 4])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(2, 2)\n", + "arr\n", + "arr.repeat(2, axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.repeat([2, 3], axis=0)\n", + "arr.repeat([2, 3], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr\n", + "np.tile(arr, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr\n", + "np.tile(arr, (2, 1))\n", + "np.tile(arr, (3, 2))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Fancy Indexing Equivalents: take and put" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = 
np.arange(10) * 100\n", + "inds = [7, 1, 2, 6]\n", + "arr[inds]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.take(inds)\n", + "arr.put(inds, 42)\n", + "arr\n", + "arr.put(inds, [40, 41, 42, 43])\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "inds = [2, 0, 2, 1]\n", + "arr = np.random.randn(2, 4)\n", + "arr\n", + "arr.take(inds, axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Broadcasting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(5)\n", + "arr\n", + "arr * 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(4, 3)\n", + "arr.mean(0)\n", + "demeaned = arr - arr.mean(0)\n", + "demeaned\n", + "demeaned.mean(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr\n", + "row_means = arr.mean(1)\n", + "row_means.shape\n", + "row_means.reshape((4, 1))\n", + "demeaned = arr - row_means.reshape((4, 1))\n", + "demeaned.mean(1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Broadcasting Over Other Axes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr - arr.mean(1)" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr - arr.mean(1).reshape((4, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.zeros((4, 4))\n", + "arr_3d = arr[:, np.newaxis, :]\n", + "arr_3d.shape\n", + "arr_1d = np.random.normal(size=3)\n", + "arr_1d[:, np.newaxis]\n", + "arr_1d[np.newaxis, :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(3, 4, 5)\n", + "depth_means = arr.mean(2)\n", + "depth_means\n", + "depth_means.shape\n", + "demeaned = arr - depth_means[:, :, np.newaxis]\n", + "demeaned.mean(2)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def demean_axis(arr, axis=0):\n", + " means = arr.mean(axis)\n", + "\n", + " # This generalizes things like [:, :, np.newaxis] to N dimensions\n", + " indexer = [slice(None)] * arr.ndim\n", + " indexer[axis] = np.newaxis\n", + " return arr - means[tuple(indexer)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Setting Array Values by Broadcasting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.zeros((4, 3))\n", + "arr[:] = 5\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "col = np.array([1.28, -0.42, 0.44, 1.6])\n", + "arr[:] = col[:, np.newaxis]\n", + "arr\n", + "arr[:2] = [[-1.37],
[0.509]]\n", + "arr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Advanced ufunc Usage" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### ufunc Instance Methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.add.reduce(arr)\n", + "arr.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.random.seed(12346) # for reproducibility\n", + "arr = np.random.randn(5, 5)\n", + "arr[::2].sort(1) # sort a few rows\n", + "arr[:, :-1] < arr[:, 1:]\n", + "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(15).reshape((3, 5))\n", + "np.add.accumulate(arr, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(3).repeat([1, 2, 2])\n", + "arr\n", + "np.multiply.outer(arr, np.arange(5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x, y = np.random.randn(3, 4), np.random.randn(5)\n", + "result = np.subtract.outer(x, y)\n", + "result.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.add.reduceat(arr, 
[0, 5, 8])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.multiply.outer(np.arange(4), np.arange(5))\n", + "arr\n", + "np.add.reduceat(arr, [0, 2, 4], axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Writing New ufuncs in Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def add_elements(x, y):\n", + " return x + y\n", + "add_them = np.frompyfunc(add_elements, 2, 1)\n", + "add_them(np.arange(8), np.arange(8))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "add_them = np.vectorize(add_elements, otypes=[np.float64])\n", + "add_them(np.arange(8), np.arange(8))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(10000)\n", + "%timeit add_them(arr, arr)\n", + "%timeit np.add(arr, arr)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Structured and Record Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dtype = [('x', np.float64), ('y', np.int32)]\n", + "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)\n", + "sarr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sarr[0]\n", + "sarr[0]['y']" + ] 
+ }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sarr['x']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Nested dtypes and Multidimensional Fields" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dtype = [('x', np.int64, 3), ('y', np.int32)]\n", + "arr = np.zeros(4, dtype=dtype)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr[0]['x']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr['x']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]\n", + "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)\n", + "data['x']\n", + "data['y']\n", + "data['x']['a']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Why Use Structured Arrays?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## More About Sorting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(6)\n", + "arr.sort()\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(3, 5)\n", + "arr\n", + "arr[:, 0].sort() # Sort first column values in-place\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(5)\n", + "arr\n", + "np.sort(arr)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(3, 5)\n", + "arr\n", + "arr.sort(axis=1)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr[:, ::-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Indirect Sorts: argsort and lexsort" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "values = np.array([5, 0, 1, 3, 2])\n", + "indexer = values.argsort()\n", + "indexer\n", + "values[indexer]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = 
np.random.randn(3, 5)\n", + "arr[0] = values\n", + "arr\n", + "arr[:, arr[0].argsort()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])\n", + "last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])\n", + "sorter = np.lexsort((first_name, last_name))\n", + "sorter\n", + "list(zip(last_name[sorter], first_name[sorter]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Alternative Sort Algorithms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "values = np.array(['2:first', '2:second', '1:first', '1:second',\n", + " '1:third'])\n", + "key = np.array([2, 2, 1, 1, 1])\n", + "indexer = key.argsort(kind='mergesort')\n", + "indexer\n", + "values.take(indexer)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Partially Sorting Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.random.seed(12345)\n", + "arr = np.random.randn(20)\n", + "arr\n", + "np.partition(arr, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "indices = np.argpartition(arr, 3)\n", + "indices\n", + "arr.take(indices)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### numpy.searchsorted: Finding Elements in a Sorted Array" + ] + }, + { + "cell_type": "code", + "execution_count": null, +
"metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([0, 1, 7, 12, 15])\n", + "arr.searchsorted(9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.searchsorted([0, 8, 11, 16])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([0, 0, 0, 1, 1, 1, 1])\n", + "arr.searchsorted([0, 1])\n", + "arr.searchsorted([0, 1], side='right')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = np.floor(np.random.uniform(0, 10000, size=50))\n", + "bins = np.array([0, 100, 1000, 5000, 10000])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "labels = bins.searchsorted(data)\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.Series(data).groupby(labels).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Writing Fast NumPy Functions with Numba" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "def mean_distance(x, y):\n", + " nx = len(x)\n", + " result = 0.0\n", + " count = 0\n", + " for i in range(nx):\n", + " result += x[i] - y[i]\n", + " count += 1\n", + " return result / 
count" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [209]: x = np.random.randn(10000000)\n", + "\n", + "In [210]: y = np.random.randn(10000000)\n", + "\n", + "In [211]: %timeit mean_distance(x, y)\n", + "1 loop, best of 3: 2 s per loop\n", + "\n", + "In [212]: %timeit (x - y).mean()\n", + "100 loops, best of 3: 14.7 ms per loop" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [213]: import numba as nb\n", + "\n", + "In [214]: numba_mean_distance = nb.jit(mean_distance)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "@nb.jit\n", + "def mean_distance(x, y):\n", + " nx = len(x)\n", + " result = 0.0\n", + " count = 0\n", + " for i in range(nx):\n", + " result += x[i] - y[i]\n", + " count += 1\n", + " return result / count" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [215]: %timeit numba_mean_distance(x, y)\n", + "100 loops, best of 3: 10.3 ms per loop" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from numba import float64, njit\n", + "\n", + "@njit(float64(float64[:], float64[:]))\n", + "def mean_distance(x, y):\n", + " return (x - y).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Creating Custom numpy.ufunc Objects with Numba" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from numba import vectorize\n", + "\n", + "@vectorize\n", + "def nb_add(x, y):\n", + " return x + y" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [13]: x = np.arange(10)\n", + "\n", + "In [14]: nb_add(x, x)\n", + "Out[14]: array([ 0., 2., 4., 
6., 8., 10., 12., 14., 16., 18.])\n", + "\n", + "In [15]: nb_add.accumulate(x, 0)\n", + "Out[15]: array([ 0., 1., 3., 6., 10., 15., 21., 28., 36., 45.])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Advanced Array Input and Output" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Memory-Mapped Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mmap = np.memmap('mymmap', dtype='float64', mode='w+',\n", + " shape=(10000, 10000))\n", + "mmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "section = mmap[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "section[:] = np.random.randn(5, 10000)\n", + "mmap.flush()\n", + "mmap\n", + "del mmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))\n", + "mmap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%xdel mmap\n", + "!rm mymmap" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### HDF5 and Other Array Storage Options" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Performance Tips" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### The Importance of Contiguous Memory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr_c = np.ones((1000, 1000), order='C')\n", + "arr_f = np.ones((1000, 1000), order='F')\n", + "arr_c.flags\n", + "arr_f.flags\n", + "arr_f.flags.f_contiguous" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%timeit arr_c.sum(1)\n", + "%timeit arr_f.sum(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr_f.copy('C').flags" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr_c[:50].flags.contiguous\n", + "arr_c[:, :50].flags" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%xdel arr_c\n", + "%xdel arr_f" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + } + ], "metadata": { - "name": "generated_appa" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Advanced NumPy" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "np.random.seed(12345)", - "import 
matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "ndarray Object Internals" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.ones((10, 5)).shape" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.ones((3, 4, 5), dtype=np.float64).strides" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "NumPy dtype Hierarchy" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ints = np.ones(10, dtype=np.uint16)", - "floats = np.ones(10, dtype=np.float32)", - "np.issubdtype(ints.dtype, np.integer)", - "np.issubdtype(floats.dtype, np.floating)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.float64.mro()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.issubdtype(ints.dtype, np.number)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced Array Manipulation" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reshaping Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(8)", - "arr", - "arr.reshape((4, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.reshape((4, 2)).reshape((2, 4))" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15)", - "arr.reshape((5, -1))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "other_arr = np.ones((3, 5))", - "other_arr.shape", - "arr.reshape(other_arr.shape)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15).reshape((5, 3))", - "arr", - "arr.ravel()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.flatten()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "C Versus Fortran Order" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(12).reshape((3, 4))", - "arr", - "arr.ravel()", - "arr.ravel('F')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Concatenating and Splitting Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr1 = np.array([[1, 2, 3], [4, 5, 6]])", - "arr2 = np.array([[7, 8, 9], [10, 11, 12]])", - "np.concatenate([arr1, arr2], axis=0)", - "np.concatenate([arr1, arr2], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.vstack((arr1, arr2))", - "np.hstack((arr1, arr2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(5, 2)", - "arr", - "first, second, third = np.split(arr, [1, 3])", - "first", - "second", - "third" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Stacking helpers: r_ and c_" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(6)", - "arr1 = arr.reshape((3, 2))", - "arr2 = np.random.randn(3, 2)", - "np.r_[arr1, arr2]", - "np.c_[np.r_[arr1, arr2], arr]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.c_[1:6, -10:-5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Repeating Elements: tile and repeat" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(3)", - "arr", - "arr.repeat(3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.repeat([2, 3, 4])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(2, 2)", - "arr", - "arr.repeat(2, axis=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.repeat([2, 3], axis=0)", - "arr.repeat([2, 3], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr", - "np.tile(arr, 2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr", - "np.tile(arr, (2, 1))", - "np.tile(arr, (3, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Fancy Indexing Equivalents: take and put" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10) * 100", - "inds = [7, 1, 2, 6]", - 
"arr[inds]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.take(inds)", - "arr.put(inds, 42)", - "arr", - "arr.put(inds, [40, 41, 42, 43])", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "inds = [2, 0, 2, 1]", - "arr = np.random.randn(2, 4)", - "arr", - "arr.take(inds, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Broadcasting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(5)", - "arr", - "arr * 4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(4, 3)", - "arr.mean(0)", - "demeaned = arr - arr.mean(0)", - "demeaned", - "demeaned.mean(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr", - "row_means = arr.mean(1)", - "row_means.shape", - "row_means.reshape((4, 1))", - "demeaned = arr - row_means.reshape((4, 1))", - "demeaned.mean(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Broadcasting Over Other Axes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr - arr.mean(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr - arr.mean(1).reshape((4, 1))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.zeros((4, 4))", - "arr_3d = arr[:, np.newaxis, :]", - "arr_3d.shape", - "arr_1d = np.random.normal(size=3)", - "arr_1d[:, np.newaxis]", - 
"arr_1d[np.newaxis, :]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(3, 4, 5)", - "depth_means = arr.mean(2)", - "depth_means", - "depth_means.shape", - "demeaned = arr - depth_means[:, :, np.newaxis]", - "demeaned.mean(2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def demean_axis(arr, axis=0):", - " means = arr.mean(axis)", - "", - " # This generalizes things like [:, :, np.newaxis] to N dimensions", - " indexer = [slice(None)] * arr.ndim", - " indexer[axis] = np.newaxis", - " return arr - means[indexer]" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Setting Array Values by Broadcasting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.zeros((4, 3))", - "arr[:] = 5", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "col = np.array([1.28, -0.42, 0.44, 1.6])", - "arr[:] = col[:, np.newaxis]", - "arr", - "arr[:2] = [[-1.37], [0.509]]", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced ufunc Usage" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "ufunc Instance Methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)", - "np.add.reduce(arr)", - "arr.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12346) # for reproducibility", - "arr = np.random.randn(5, 5)", - "arr[::2].sort(1) # sort a few rows", - "arr[:, :-1] < arr[:, 1:]", - "np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15).reshape((3, 5))", - "np.add.accumulate(arr, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(3).repeat([1, 2, 2])", - "arr", - "np.multiply.outer(arr, np.arange(5))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x, y = np.random.randn(3, 4), np.random.randn(5)", - "result = np.subtract.outer(x, y)", - "result.shape" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)", - "np.add.reduceat(arr, [0, 5, 8])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.multiply.outer(np.arange(4), np.arange(5))", - "arr", - "np.add.reduceat(arr, [0, 2, 4], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Writing New ufuncs in Python" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def add_elements(x, y):", - " return x + y", - "add_them = np.frompyfunc(add_elements, 2, 1)", - "add_them(np.arange(8), np.arange(8))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "add_them = np.vectorize(add_elements, otypes=[np.float64])", - "add_them(np.arange(8), np.arange(8))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(10000)", - "%timeit add_them(arr, arr)", - "%timeit np.add(arr, arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Structured and Record Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dtype = [('x', np.float64), ('y', np.int32)]", - "sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)", - "sarr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sarr[0]", - "sarr[0]['y']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sarr['x']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Nested dtypes and Multidimensional Fields" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dtype = [('x', np.int64, 3), ('y', np.int32)]", - "arr = np.zeros(4, dtype=dtype)", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[0]['x']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr['x']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]", - "data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)", - "data['x']", - "data['y']", - "data['x']['a']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Why Use Structured Arrays?" 
- ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "More About Sorting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(6)", - "arr.sort()", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(3, 5)", - "arr", - "arr[:, 0].sort() # Sort first column values in-place", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(5)", - "arr", - "np.sort(arr)", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(3, 5)", - "arr", - "arr.sort(axis=1)", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[:, ::-1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Indirect Sorts: argsort and lexsort" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = np.array([5, 0, 1, 3, 2])", - "indexer = values.argsort()", - "indexer", - "values[indexer]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(3, 5)", - "arr[0] = values", - "arr", - "arr[:, arr[0].argsort()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])", - "last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])", - "sorter = np.lexsort((first_name, last_name))", - "sorter", - "zip(last_name[sorter], first_name[sorter])" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Alternative Sort Algorithms" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = np.array(['2:first', '2:second', '1:first', '1:second',", - " '1:third'])", - "key = np.array([2, 2, 1, 1, 1])", - "indexer = key.argsort(kind='mergesort')", - "indexer", - "values.take(indexer)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Partially Sorting Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)", - "arr = np.random.randn(20)", - "arr", - "np.partition(arr, 3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "indices = np.argpartition(arr, 3)", - "indices", - "arr.take(indices)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "numpy.searchsorted: Finding Elements in a Sorted Array" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([0, 1, 7, 12, 15])", - "arr.searchsorted(9)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.searchsorted([0, 8, 11, 16])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([0, 0, 0, 1, 1, 1, 1])", - "arr.searchsorted([0, 1])", - "arr.searchsorted([0, 1], side='right')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = np.floor(np.random.uniform(0, 10000, size=50))", - "bins = np.array([0, 100, 1000, 5000, 10000])", - "data" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "labels = bins.searchsorted(data)", - "labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.Series(data).groupby(labels).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Writing Fast NumPy Functions with Numba" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np", - "", - "def mean_distance(x, y):", - " nx = len(x)", - " result = 0.0", - " count = 0", - " for i in range(nx):", - " result += x[i] - y[i]", - " count += 1", - " return result / count" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [209]: x = np.random.randn(10000000)", - "", - "In [210]: y = np.random.randn(10000000)", - "", - "In [211]: %timeit mean_distance(x, y)", - "1 loop, best of 3: 2 s per loop", - "", - "In [212]: %timeit (x - y).mean()", - "100 loops, best of 3: 14.7 ms per loop" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [213]: import numba as nb", - "", - "In [214]: numba_mean_distance = nb.jit(mean_distance)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "@nb.jit", - "def mean_distance(x, y):", - " nx = len(x)", - " result = 0.0", - " count = 0", - " for i in range(nx):", - " result += x[i] - y[i]", - " count += 1", - " return result / count" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [215]: %timeit numba_mean_distance(x, y)", - "100 loops, best of 3: 10.3 ms per loop" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from numba import float64, njit", - "", - "@njit(float64(float64[:], float64[:]))", - "def mean_distance(x, y):", - " return (x - 
y).mean()" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Creating Custom numpy.ufunc Objects with Numba" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from numba import vectorize", - "", - "@vectorize", - "def nb_add(x, y):", - " return x + y" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [13]: x = np.arange(10)", - "", - "In [14]: nb_add(x, x)", - "Out[14]: array([ 0., 2., 4., 6., 8., 10., 12., 14., 16., 18.])", - "", - "In [15]: nb_add.accumulate(x, 0)", - "Out[15]: array([ 0., 1., 3., 6., 10., 15., 21., 28., 36., 45.])" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced Array Input and Output" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Memory-Mapped Files" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mmap = np.memmap('mymmap', dtype='float64', mode='w+',", - " shape=(10000, 10000))", - "mmap" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "section = mmap[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "section[:] = np.random.randn(5, 10000)", - "mmap.flush()", - "mmap", - "del mmap" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))", - "mmap" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%xdel mmap", - "!rm mymmap" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "HDF5 and Other Array Storage Options" - ] - }, - { - "cell_type": "heading", - "level": 2, - 
"metadata": {}, - "source": [ - "Performance Tips" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "The Importance of Contiguous Memory" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_c = np.ones((1000, 1000), order='C')", - "arr_f = np.ones((1000, 1000), order='F')", - "arr_c.flags", - "arr_f.flags", - "arr_f.flags.f_contiguous" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%timeit arr_c.sum(1)", - "%timeit arr_f.sum(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_f.copy('C').flags" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_c[:50].flags.contiguous", - "arr_c[:, :50].flags" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%xdel arr_c", - "%xdel arr_f" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/appb.ipynb b/appb.ipynb index 881c29084..8911a553a 
100644 --- a/appb.ipynb +++ b/appb.ipynb @@ -1,901 +1,1120 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# More on the IPython System" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Using the Command History" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Searching and Reusing the Command History" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In[7]: %run first/second/third/data_script.py" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: a_command = foo(x, y, z)\n", + "\n", + "(reverse-i-search)`com': a_command = foo(x, y, z)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Input and Output Variables" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [24]: 2 ** 27\n", + "Out[24]: 134217728\n", + "\n", + "In [25]: _\n", + "Out[25]: 134217728" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [26]: foo = 'bar'\n", + "\n", + "In [27]: foo\n", + "Out[27]: 'bar'\n", + "\n", + "In [28]: _i27\n", + "Out[28]: u'foo'\n", + "\n", + "In [29]: _27\n", + "Out[29]: 'bar'" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [30]: exec(_i27)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Interacting with the Operating System" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Shell Commands and Aliases" + ] + }, 
+ { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: ip_info = !ifconfig wlan0 | grep \"inet \"\n", + "\n", + "In [2]: ip_info[0].strip()\n", + "Out[2]: 'inet addr:10.0.0.11 Bcast:10.0.0.255 Mask:255.255.255.0'" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [3]: foo = 'test*'\n", + "\n", + "In [4]: !ls $foo\n", + "test4.py test.py test.xml" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: %alias ll ls -l\n", + "\n", + "In [2]: ll /usr\n", + "total 332\n", + "drwxr-xr-x 2 root root 69632 2012-01-29 20:36 bin/\n", + "drwxr-xr-x 2 root root 4096 2010-08-23 12:05 games/\n", + "drwxr-xr-x 123 root root 20480 2011-12-26 18:08 include/\n", + "drwxr-xr-x 265 root root 126976 2012-01-29 20:36 lib/\n", + "drwxr-xr-x 44 root root 69632 2011-12-26 18:08 lib32/\n", + "lrwxrwxrwx 1 root root 3 2010-08-23 16:02 lib64 -> lib/\n", + "drwxr-xr-x 15 root root 4096 2011-10-13 19:03 local/\n", + "drwxr-xr-x 2 root root 12288 2012-01-12 09:32 sbin/\n", + "drwxr-xr-x 387 root root 12288 2011-11-04 22:53 share/\n", + "drwxrwsr-x 24 root src 4096 2011-07-17 18:38 src/" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [558]: %alias test_alias (cd examples; ls; cd ..)\n", + "\n", + "In [559]: test_alias\n", + "macrodata.csv spx.csv tips.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Directory Bookmark System" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [6]: %bookmark py4da /home/wesm/code/pydata-book" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [7]: cd py4da\n", + "(bookmark:py4da) -> 
/home/wesm/code/pydata-book\n", + "/home/wesm/code/pydata-book" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [8]: %bookmark -l\n", + "Current bookmarks:\n", + "py4da -> /home/wesm/code/pydata-book-source" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Software Development Tools" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Interactive Debugger" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [2]: run examples/ipython_bug.py\n", + "---------------------------------------------------------------------------\n", + "AssertionError Traceback (most recent call last)\n", + "/home/wesm/code/pydata-book/examples/ipython_bug.py in ()\n", + " 13 throws_an_exception()\n", + " 14\n", + "---> 15 calling_things()\n", + "\n", + "/home/wesm/code/pydata-book/examples/ipython_bug.py in calling_things()\n", + " 11 def calling_things():\n", + " 12 works_fine()\n", + "---> 13 throws_an_exception()\n", + " 14\n", + " 15 calling_things()\n", + "\n", + "/home/wesm/code/pydata-book/examples/ipython_bug.py in throws_an_exception()\n", + " 7 a = 5\n", + " 8 b = 6\n", + "----> 9 assert(a + b == 10)\n", + " 10\n", + " 11 def calling_things():\n", + "\n", + "AssertionError:\n", + "\n", + "In [3]: %debug\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(9)throws_an_exception()\n", + " 8 b = 6\n", + "----> 9 assert(a + b == 10)\n", + " 10\n", + "\n", + "ipdb>" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ipdb> u\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(13)calling_things()\n", + " 12 works_fine()\n", + "---> 13 throws_an_exception()\n", + " 14" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + 
"editable": true + }, + "source": [ + "In [5]: run -d examples/ipython_bug.py\n", + "Breakpoint 1 at /home/wesm/code/pydata-book/examples/ipython_bug.py:1\n", + "NOTE: Enter 'c' at the ipdb> prompt to start your script.\n", + "> (1)()\n", + "\n", + "ipdb> s\n", + "--Call--\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(1)()\n", + "1---> 1 def works_fine():\n", + " 2 a = 5\n", + " 3 b = 6" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ipdb> b 12\n", + "ipdb> c\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(12)calling_things()\n", + " 11 def calling_things():\n", + "2--> 12 works_fine()\n", + " 13 throws_an_exception()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ipdb> n\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(13)calling_things()\n", + "2 12 works_fine()\n", + "---> 13 throws_an_exception()\n", + " 14" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ipdb> s\n", + "--Call--\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(6)throws_an_exception()\n", + " 5\n", + "----> 6 def throws_an_exception():\n", + " 7 a = 5\n", + "\n", + "ipdb> n\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(7)throws_an_exception()\n", + " 6 def throws_an_exception():\n", + "----> 7 a = 5\n", + " 8 b = 6\n", + "\n", + "ipdb> n\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(8)throws_an_exception()\n", + " 7 a = 5\n", + "----> 8 b = 6\n", + " 9 assert(a + b == 10)\n", + "\n", + "ipdb> n\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(9)throws_an_exception()\n", + " 8 b = 6\n", + "----> 9 assert(a + b == 10)\n", + " 10\n", + "\n", + "ipdb> !a\n", + "5\n", + "ipdb> !b\n", + "6" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### 
Other ways to make use of the debugger" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from IPython.core.debugger import Pdb\n", + "\n", + "def set_trace():\n", + " Pdb(color_scheme='Linux').set_trace(sys._getframe().f_back)\n", + "\n", + "def debug(f, *args, **kwargs):\n", + " pdb = Pdb(color_scheme='Linux')\n", + " return pdb.runcall(f, *args, **kwargs)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [7]: run examples/ipython_bug.py\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(16)calling_things()\n", + " 15 set_trace()\n", + "---> 16 throws_an_exception()\n", + " 17" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def f(x, y, z=1):\n", + " tmp = x + y\n", + " return tmp / z" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [6]: debug(f, 1, 2, z=3)\n", + "> (2)f()\n", + " 1 def f(x, y, z):\n", + "----> 2 tmp = x + y\n", + " 3 return tmp / z\n", + "\n", + "ipdb>" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: %run -d examples/ipython_bug.py\n", + "Breakpoint 1 at /home/wesm/code/pydata-book/examples/ipython_bug.py:1\n", + "NOTE: Enter 'c' at the ipdb> prompt to start your script.\n", + "> (1)()\n", + "\n", + "ipdb>" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [2]: %run -d -b2 examples/ipython_bug.py\n", + "Breakpoint 1 at /home/wesm/code/pydata-book/examples/ipython_bug.py:2\n", + "NOTE: Enter 'c' at the ipdb> prompt to start your script.\n", + "> (1)()\n", + "\n", + "ipdb> c\n", + "> /home/wesm/code/pydata-book/examples/ipython_bug.py(2)works_fine()\n", + " 1 def works_fine():\n", + "1---> 2 a = 5\n", + " 3 b = 6\n", + "\n", + "ipdb>" + ] + }, 
+ { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Timing Code: %time and %timeit" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import time\n", + "start = time.time()\n", + "for i in range(iterations):\n", + " # some code to run here\n", + "elapsed_per = (time.time() - start) / iterations" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# a very large list of strings\n", + "strings = ['foo', 'foobar', 'baz', 'qux',\n", + " 'python', 'Guido Van Rossum'] * 100000\n", + "\n", + "method1 = [x for x in strings if x.startswith('foo')]\n", + "\n", + "method2 = [x for x in strings if x[:3] == 'foo']" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [561]: %time method1 = [x for x in strings if x.startswith('foo')]\n", + "CPU times: user 0.19 s, sys: 0.00 s, total: 0.19 s\n", + "Wall time: 0.19 s\n", + "\n", + "In [562]: %time method2 = [x for x in strings if x[:3] == 'foo']\n", + "CPU times: user 0.09 s, sys: 0.00 s, total: 0.09 s\n", + "Wall time: 0.09 s" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [563]: %timeit [x for x in strings if x.startswith('foo')]\n", + "10 loops, best of 3: 159 ms per loop\n", + "\n", + "In [564]: %timeit [x for x in strings if x[:3] == 'foo']\n", + "10 loops, best of 3: 59.3 ms per loop" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [565]: x = 'foobar'\n", + "\n", + "In [566]: y = 'foo'\n", + "\n", + "In [567]: %timeit x.startswith(y)\n", + "1000000 loops, best of 3: 267 ns per loop\n", + "\n", + "In [568]: %timeit x[:3] == y\n", + "10000000 loops, best of 3: 147 ns per loop" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": 
true, + "editable": true + }, + "source": [ + "### Basic Profiling: %prun and %run -p" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import numpy as np\n", + "from numpy.linalg import eigvals\n", + "\n", + "def run_experiment(niter=100):\n", + " K = 100\n", + " results = []\n", + " for _ in xrange(niter):\n", + " mat = np.random.randn(K, K)\n", + " max_eigenvalue = np.abs(eigvals(mat)).max()\n", + " results.append(max_eigenvalue)\n", + " return results\n", + "some_results = run_experiment()\n", + "print 'Largest one we saw: %s' % np.max(some_results)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "python -m cProfile cprof_example.py" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ python -m cProfile -s cumulative cprof_example.py\n", + "Largest one we saw: 11.923204422\n", + " 15116 function calls (14927 primitive calls) in 0.720 seconds\n", + "\n", + "Ordered by: cumulative time\n", + "\n", + "ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.001 0.001 0.721 0.721 cprof_example.py:1()\n", + " 100 0.003 0.000 0.586 0.006 linalg.py:702(eigvals)\n", + " 200 0.572 0.003 0.572 0.003 {numpy.linalg.lapack_lite.dgeev}\n", + " 1 0.002 0.002 0.075 0.075 __init__.py:106()\n", + " 100 0.059 0.001 0.059 0.001 {method 'randn')\n", + " 1 0.000 0.000 0.044 0.044 add_newdocs.py:9()\n", + " 2 0.001 0.001 0.037 0.019 __init__.py:1()\n", + " 2 0.003 0.002 0.030 0.015 __init__.py:2()\n", + " 1 0.000 0.000 0.030 0.030 type_check.py:3()\n", + " 1 0.001 0.001 0.021 0.021 __init__.py:15()\n", + " 1 0.013 0.013 0.013 0.013 numeric.py:1()\n", + " 1 0.000 0.000 0.009 0.009 __init__.py:6()\n", + " 1 0.001 0.001 0.008 0.008 __init__.py:45()\n", + " 262 0.005 0.000 0.007 0.000 function_base.py:3178(add_newdoc)\n", + " 100 0.003 0.000 0.005 0.000 
linalg.py:162(_assertFinite)\n", + " ..." + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [4]: %prun -l 7 -s cumulative run_experiment()\n", + " 4203 function calls in 0.643 seconds\n", + "\n", + "Ordered by: cumulative time\n", + "List reduced from 32 to 7 due to restriction <7>\n", + "\n", + "ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.000 0.000 0.643 0.643 <string>:1(<module>)\n", + " 1 0.001 0.001 0.643 0.643 cprof_example.py:4(run_experiment)\n", + " 100 0.003 0.000 0.583 0.006 linalg.py:702(eigvals)\n", + " 200 0.569 0.003 0.569 0.003 {numpy.linalg.lapack_lite.dgeev}\n", + " 100 0.058 0.001 0.058 0.001 {method 'randn'}\n", + " 100 0.003 0.000 0.005 0.000 linalg.py:162(_assertFinite)\n", + " 200 0.002 0.000 0.002 0.000 {method 'all' of 'numpy.ndarray'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Profiling a Function Line by Line" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# A list of dotted module names of IPython extensions to load.\n", + "c.TerminalIPythonApp.extensions = ['line_profiler']" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "%load_ext line_profiler" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from numpy.random import randn\n", + "\n", + "def add_and_sum(x, y):\n", + " added = x + y\n", + " summed = added.sum(axis=1)\n", + " return summed\n", + "\n", + "def call_function():\n", + " x = randn(1000, 1000)\n", + " y = randn(1000, 1000)\n", + " return add_and_sum(x, y)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [569]: %run prof_mod\n", + "\n", + "In [570]: x = randn(3000, 3000)\n", + "\n", + "In [571]: y = 
randn(3000, 3000)\n", + "\n", + "In [572]: %prun add_and_sum(x, y)\n", + " 4 function calls in 0.049 seconds\n", + " Ordered by: internal time\n", + " ncalls tottime percall cumtime percall filename:lineno(function)\n", + " 1 0.036 0.036 0.046 0.046 prof_mod.py:3(add_and_sum)\n", + " 1 0.009 0.009 0.009 0.009 {method 'sum' of 'numpy.ndarray'}\n", + " 1 0.003 0.003 0.049 0.049 <string>:1(<module>)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "%lprun -f func1 -f func2 statement_to_profile" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [573]: %lprun -f add_and_sum add_and_sum(x, y)\n", + "Timer unit: 1e-06 s\n", + "File: prof_mod.py\n", + "Function: add_and_sum at line 3\n", + "Total time: 0.045936 s\n", + "Line # Hits Time Per Hit % Time Line Contents\n", + "==============================================================\n", + " 3 def add_and_sum(x, y):\n", + " 4 1 36510 36510.0 79.5 added = x + y\n", + " 5 1 9425 9425.0 20.5 summed = added.sum(axis=1)\n", + " 6 1 1 1.0 0.0 return summed" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [574]: %lprun -f add_and_sum -f call_function call_function()\n", + "Timer unit: 1e-06 s\n", + "File: prof_mod.py\n", + "Function: add_and_sum at line 3\n", + "Total time: 0.005526 s\n", + "Line # Hits Time Per Hit % Time Line Contents\n", + "==============================================================\n", + " 3 def add_and_sum(x, y):\n", + " 4 1 4375 4375.0 79.2 added = x + y\n", + " 5 1 1149 1149.0 20.8 summed = added.sum(axis=1)\n", + " 6 1 2 2.0 0.0 return summed\n", + "File: prof_mod.py\n", + "Function: call_function at line 8\n", + "Total time: 0.121016 s\n", + "Line # Hits Time Per Hit % Time Line Contents\n", + "==============================================================\n", + " 8 def call_function():\n", + " 9 1 57169 57169.0 47.2 x = randn(1000, 1000)\n", 
+ " 10 1 58304 58304.0 48.2 y = randn(1000, 1000)\n", + " 11 1 5543 5543.0 4.6 return add_and_sum(x, y)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Tips for Productive Code Development Using IPython" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reloading Module Dependencies" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import some_lib\n", + "\n", + "x = 5\n", + "y = [1, 2, 3, 4]\n", + "result = some_lib.get_answer(x, y)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import some_lib\n", + "import importlib\n", + "\n", + "importlib.reload(some_lib)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Code Design Tips" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Keep relevant objects and data alive" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from my_functions import g\n", + "\n", + "def f(x, y):\n", + " return g(x + y)\n", + "\n", + "def main():\n", + " x = 6\n", + " y = 7.5\n", + " result = x + y\n", + "\n", + "if __name__ == '__main__':\n", + " main()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Flat is better than nested" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Overcome a fear of longer files" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Advanced IPython Features" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"deletable": true, + "editable": true + }, + "source": [ + "### Making Your Own Classes IPython-Friendly" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "class Message:\n", + " def __init__(self, msg):\n", + " self.msg = msg" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [576]: x = Message('I have a secret')\n", + "\n", + "In [577]: x\n", + "Out[577]: <__main__.Message instance at 0x60ebbd8>" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "class Message:\n", + " def __init__(self, msg):\n", + " self.msg = msg\n", + "\n", + " def __repr__(self):\n", + " return 'Message: %s' % self.msg" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [579]: x = Message('I have a secret')\n", + "\n", + "In [580]: x\n", + "Out[580]: Message: I have a secret" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Profiles and Configuration" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "/home/wesm/.ipython/profile_default/ipython_config.py" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ipython profile create" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ipython profile create secret_project" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ ipython --profile=secret_project\n", + "Python 3.5.1 | packaged by conda-forge | (default, May 20 2016, 05:22:56)\n", + "Type \"copyright\", \"credits\" or \"license\" for more information.\n", + "\n", + "IPython 5.1.0 -- An enhanced Interactive Python.\n", + 
"? -> Introduction and overview of IPython's features.\n", + "%quickref -> Quick reference.\n", + "help -> Python's own help system.\n", + "object? -> Details about 'object', use 'object??' for extra details.\n", + "\n", + "IPython profile: secret_project" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "jupyter notebook --generate-config" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ mv ~/.jupyter/jupyter_notebook_config.py ~/.jupyter/my_custom_config.py" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "jupyter notebook --config=~/.jupyter/my_custom_config.py" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_appb" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "More on the IPython System" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Using the Command History" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Searching and Reusing the Command History" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In[7]: %run first/second/third/data_script.py" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: a_command = foo(x, y, z)", - "", - "(reverse-i-search)`com': a_command = foo(x, y, z)" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Input and Output Variables" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [24]: 2 ** 27", - "Out[24]: 134217728", - "", - "In [25]: _", - "Out[25]: 134217728" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - 
"source": [ - "In [26]: foo = 'bar'", - "", - "In [27]: foo", - "Out[27]: 'bar'", - "", - "In [28]: _i27", - "Out[28]: u'foo'", - "", - "In [29]: _27", - "Out[29]: 'bar'" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [30]: exec(_i27)" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Interacting with the Operating System" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Shell Commands and Aliases" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: ip_info = !ifconfig wlan0 | grep \"inet \"", - "", - "In [2]: ip_info[0].strip()", - "Out[2]: 'inet addr:10.0.0.11 Bcast:10.0.0.255 Mask:255.255.255.0'" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [3]: foo = 'test*'", - "", - "In [4]: !ls $foo", - "test4.py test.py test.xml" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: %alias ll ls -l", - "", - "In [2]: ll /usr", - "total 332", - "drwxr-xr-x 2 root root 69632 2012-01-29 20:36 bin/", - "drwxr-xr-x 2 root root 4096 2010-08-23 12:05 games/", - "drwxr-xr-x 123 root root 20480 2011-12-26 18:08 include/", - "drwxr-xr-x 265 root root 126976 2012-01-29 20:36 lib/", - "drwxr-xr-x 44 root root 69632 2011-12-26 18:08 lib32/", - "lrwxrwxrwx 1 root root 3 2010-08-23 16:02 lib64 -> lib/", - "drwxr-xr-x 15 root root 4096 2011-10-13 19:03 local/", - "drwxr-xr-x 2 root root 12288 2012-01-12 09:32 sbin/", - "drwxr-xr-x 387 root root 12288 2011-11-04 22:53 share/", - "drwxrwsr-x 24 root src 4096 2011-07-17 18:38 src/" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [558]: %alias test_alias (cd examples; ls; cd ..)", - "", - "In [559]: test_alias", - "macrodata.csv spx.csv tips.csv" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Directory Bookmark System" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [6]: 
%bookmark py4da /home/wesm/code/pydata-book" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [7]: cd py4da", - "(bookmark:py4da) -> /home/wesm/code/pydata-book", - "/home/wesm/code/pydata-book" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [8]: %bookmark -l", - "Current bookmarks:", - "py4da -> /home/wesm/code/pydata-book-source" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Software Development Tools" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Interactive Debugger" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [2]: run examples/ipython_bug.py", - "---------------------------------------------------------------------------", - "AssertionError Traceback (most recent call last)", - "/home/wesm/code/pydata-book/examples/ipython_bug.py in ()", - " 13 throws_an_exception()", - " 14", - "---> 15 calling_things()", - "", - "/home/wesm/code/pydata-book/examples/ipython_bug.py in calling_things()", - " 11 def calling_things():", - " 12 works_fine()", - "---> 13 throws_an_exception()", - " 14", - " 15 calling_things()", - "", - "/home/wesm/code/pydata-book/examples/ipython_bug.py in throws_an_exception()", - " 7 a = 5", - " 8 b = 6", - "----> 9 assert(a + b == 10)", - " 10", - " 11 def calling_things():", - "", - "AssertionError:", - "", - "In [3]: %debug", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(9)throws_an_exception()", - " 8 b = 6", - "----> 9 assert(a + b == 10)", - " 10", - "", - "ipdb>" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ipdb> u", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(13)calling_things()", - " 12 works_fine()", - "---> 13 throws_an_exception()", - " 14" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [5]: run -d examples/ipython_bug.py", - "Breakpoint 1 at /home/wesm/code/pydata-book/examples/ipython_bug.py:1", 
- "NOTE: Enter 'c' at the ipdb> prompt to start your script.", - "> (1)()", - "", - "ipdb> s", - "--Call--", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(1)()", - "1---> 1 def works_fine():", - " 2 a = 5", - " 3 b = 6" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ipdb> b 12", - "ipdb> c", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(12)calling_things()", - " 11 def calling_things():", - "2--> 12 works_fine()", - " 13 throws_an_exception()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ipdb> n", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(13)calling_things()", - "2 12 works_fine()", - "---> 13 throws_an_exception()", - " 14" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ipdb> s", - "--Call--", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(6)throws_an_exception()", - " 5", - "----> 6 def throws_an_exception():", - " 7 a = 5", - "", - "ipdb> n", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(7)throws_an_exception()", - " 6 def throws_an_exception():", - "----> 7 a = 5", - " 8 b = 6", - "", - "ipdb> n", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(8)throws_an_exception()", - " 7 a = 5", - "----> 8 b = 6", - " 9 assert(a + b == 10)", - "", - "ipdb> n", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(9)throws_an_exception()", - " 8 b = 6", - "----> 9 assert(a + b == 10)", - " 10", - "", - "ipdb> !a", - "5", - "ipdb> !b", - "6" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Other ways to make use of the debugger" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from IPython.core.debugger import Pdb", - "", - "def set_trace():", - " Pdb(color_scheme='Linux').set_trace(sys._getframe().f_back)", - "", - "def debug(f, *args, **kwargs):", - " pdb = Pdb(color_scheme='Linux')", - " return pdb.runcall(f, *args, **kwargs)" - ] - }, - { - "cell_type": "raw", - "metadata": 
{}, - "source": [ - "In [7]: run examples/ipython_bug.py", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(16)calling_things()", - " 15 set_trace()", - "---> 16 throws_an_exception()", - " 17" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def f(x, y, z=1):", - " tmp = x + y", - " return tmp / z" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [6]: debug(f, 1, 2, z=3)", - "> (2)f()", - " 1 def f(x, y, z):", - "----> 2 tmp = x + y", - " 3 return tmp / z", - "", - "ipdb>" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: %run -d examples/ipython_bug.py", - "Breakpoint 1 at /home/wesm/code/pydata-book/examples/ipython_bug.py:1", - "NOTE: Enter 'c' at the ipdb> prompt to start your script.", - "> (1)()", - "", - "ipdb>" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [2]: %run -d -b2 examples/ipython_bug.py", - "Breakpoint 1 at /home/wesm/code/pydata-book/examples/ipython_bug.py:2", - "NOTE: Enter 'c' at the ipdb> prompt to start your script.", - "> (1)()", - "", - "ipdb> c", - "> /home/wesm/code/pydata-book/examples/ipython_bug.py(2)works_fine()", - " 1 def works_fine():", - "1---> 2 a = 5", - " 3 b = 6", - "", - "ipdb>" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Timing Code: %time and %timeit" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import time", - "start = time.time()", - "for i in range(iterations):", - " # some code to run here", - "elapsed_per = (time.time() - start) / iterations" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "# a very large list of strings", - "strings = ['foo', 'foobar', 'baz', 'qux',", - " 'python', 'Guido Van Rossum'] * 100000", - "", - "method1 = [x for x in strings if x.startswith('foo')]", - "", - "method2 = [x for x in strings if x[:3] == 'foo']" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [561]: %time 
method1 = [x for x in strings if x.startswith('foo')]", - "CPU times: user 0.19 s, sys: 0.00 s, total: 0.19 s", - "Wall time: 0.19 s", - "", - "In [562]: %time method2 = [x for x in strings if x[:3] == 'foo']", - "CPU times: user 0.09 s, sys: 0.00 s, total: 0.09 s", - "Wall time: 0.09 s" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [563]: %timeit [x for x in strings if x.startswith('foo')]", - "10 loops, best of 3: 159 ms per loop", - "", - "In [564]: %timeit [x for x in strings if x[:3] == 'foo']", - "10 loops, best of 3: 59.3 ms per loop" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [565]: x = 'foobar'", - "", - "In [566]: y = 'foo'", - "", - "In [567]: %timeit x.startswith(y)", - "1000000 loops, best of 3: 267 ns per loop", - "", - "In [568]: %timeit x[:3] == y", - "10000000 loops, best of 3: 147 ns per loop" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Basic Profiling: %prun and %run -p" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import numpy as np", - "from numpy.linalg import eigvals", - "", - "def run_experiment(niter=100):", - " K = 100", - " results = []", - " for _ in xrange(niter):", - " mat = np.random.randn(K, K)", - " max_eigenvalue = np.abs(eigvals(mat)).max()", - " results.append(max_eigenvalue)", - " return results", - "some_results = run_experiment()", - "print 'Largest one we saw: %s' % np.max(some_results)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "python -m cProfile cprof_example.py" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ python -m cProfile -s cumulative cprof_example.py", - "Largest one we saw: 11.923204422", - " 15116 function calls (14927 primitive calls) in 0.720 seconds", - "", - "Ordered by: cumulative time", - "", - "ncalls tottime percall cumtime percall filename:lineno(function)", - " 1 0.001 0.001 0.721 0.721 cprof_example.py:1()", - " 100 0.003 0.000 
0.586 0.006 linalg.py:702(eigvals)", - " 200 0.572 0.003 0.572 0.003 {numpy.linalg.lapack_lite.dgeev}", - " 1 0.002 0.002 0.075 0.075 __init__.py:106()", - " 100 0.059 0.001 0.059 0.001 {method 'randn')", - " 1 0.000 0.000 0.044 0.044 add_newdocs.py:9()", - " 2 0.001 0.001 0.037 0.019 __init__.py:1()", - " 2 0.003 0.002 0.030 0.015 __init__.py:2()", - " 1 0.000 0.000 0.030 0.030 type_check.py:3()", - " 1 0.001 0.001 0.021 0.021 __init__.py:15()", - " 1 0.013 0.013 0.013 0.013 numeric.py:1()", - " 1 0.000 0.000 0.009 0.009 __init__.py:6()", - " 1 0.001 0.001 0.008 0.008 __init__.py:45()", - " 262 0.005 0.000 0.007 0.000 function_base.py:3178(add_newdoc)", - " 100 0.003 0.000 0.005 0.000 linalg.py:162(_assertFinite)", - " ..." - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [4]: %prun -l 7 -s cumulative run_experiment()", - " 4203 function calls in 0.643 seconds", - "", - "Ordered by: cumulative time", - "List reduced from 32 to 7 due to restriction <7>", - "", - "ncalls tottime percall cumtime percall filename:lineno(function)", - " 1 0.000 0.000 0.643 0.643 :1()", - " 1 0.001 0.001 0.643 0.643 cprof_example.py:4(run_experiment)", - " 100 0.003 0.000 0.583 0.006 linalg.py:702(eigvals)", - " 200 0.569 0.003 0.569 0.003 {numpy.linalg.lapack_lite.dgeev}", - " 100 0.058 0.001 0.058 0.001 {method 'randn'}", - " 100 0.003 0.000 0.005 0.000 linalg.py:162(_assertFinite)", - " 200 0.002 0.000 0.002 0.000 {method 'all' of 'numpy.ndarray'}" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Profiling a Function Line by Line" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "# A list of dotted module names of IPython extensions to load.", - "c.TerminalIPythonApp.extensions = ['line_profiler']" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "%load_ext line_profiler" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from numpy.random import randn", - "", - 
"def add_and_sum(x, y):", - " added = x + y", - " summed = added.sum(axis=1)", - " return summed", - "", - "def call_function():", - " x = randn(1000, 1000)", - " y = randn(1000, 1000)", - " return add_and_sum(x, y)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [569]: %run prof_mod", - "", - "In [570]: x = randn(3000, 3000)", - "", - "In [571]: y = randn(3000, 3000)", - "", - "In [572]: %prun add_and_sum(x, y)", - " 4 function calls in 0.049 seconds", - " Ordered by: internal time", - " ncalls tottime percall cumtime percall filename:lineno(function)", - " 1 0.036 0.036 0.046 0.046 prof_mod.py:3(add_and_sum)", - " 1 0.009 0.009 0.009 0.009 {method 'sum' of 'numpy.ndarray'}", - " 1 0.003 0.003 0.049 0.049 :1()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "%lprun -f func1 -f func2 " - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [573]: %lprun -f add_and_sum add_and_sum(x, y)", - "Timer unit: 1e-06 s", - "File: prof_mod.py", - "Function: add_and_sum at line 3", - "Total time: 0.045936 s", - "Line # Hits Time Per Hit % Time Line Contents", - "==============================================================", - " 3 def add_and_sum(x, y):", - " 4 1 36510 36510.0 79.5 added = x + y", - " 5 1 9425 9425.0 20.5 summed = added.sum(axis=1)", - " 6 1 1 1.0 0.0 return summed" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [574]: %lprun -f add_and_sum -f call_function call_function()", - "Timer unit: 1e-06 s", - "File: prof_mod.py", - "Function: add_and_sum at line 3", - "Total time: 0.005526 s", - "Line # Hits Time Per Hit % Time Line Contents", - "==============================================================", - " 3 def add_and_sum(x, y):", - " 4 1 4375 4375.0 79.2 added = x + y", - " 5 1 1149 1149.0 20.8 summed = added.sum(axis=1)", - " 6 1 2 2.0 0.0 return summed", - "File: prof_mod.py", - "Function: call_function at line 8", - "Total time: 0.121016 s", - "Line # Hits Time 
Per Hit % Time Line Contents", - "==============================================================", - " 8 def call_function():", - " 9 1 57169 57169.0 47.2 x = randn(1000, 1000)", - " 10 1 58304 58304.0 48.2 y = randn(1000, 1000)", - " 11 1 5543 5543.0 4.6 return add_and_sum(x, y)" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Tips for Productive Code Development Using IPython" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reloading Module Dependencies" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import some_lib", - "", - "x = 5", - "y = [1, 2, 3, 4]", - "result = some_lib.get_answer(x, y)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import some_lib", - "import importlib", - "", - "importlib.reload(some_lib)" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Code Design Tips" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Keep relevant objects and data alive" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from my_functions import g", - "", - "def f(x, y):", - " return g(x + y)", - "", - "def main():", - " x = 6", - " y = 7.5", - " result = x + y", - "", - "if __name__ == '__main__':", - " main()" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Flat is better than nested" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Overcome a fear of longer files" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced IPython Features" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Making Your Own Classes IPython-Friendly" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "class Message:", - " def __init__(self, msg):", - " self.msg = msg" - ] - }, - { - "cell_type": 
"raw", - "metadata": {}, - "source": [ - "In [576]: x = Message('I have a secret')", - "", - "In [577]: x", - "Out[577]: <__main__.Message instance at 0x60ebbd8>" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "class Message:", - " def __init__(self, msg):", - " self.msg = msg", - "", - " def __repr__(self):", - " return 'Message: %s' % self.msg" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [579]: x = Message('I have a secret')", - "", - "In [580]: x", - "Out[580]: Message: I have a secret" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Profiles and Configuration" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "/home/wesm/.ipython/profile_default/ipython_config.py" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ipython profile create" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ipython profile create secret_project" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ ipython --profile=secret_project", - "Python 3.5.1 | packaged by conda-forge | (default, May 20 2016, 05:22:56)", - "Type \"copyright\", \"credits\" or \"license\" for more information.", - "", - "IPython 5.1.0 -- An enhanced Interactive Python.", - "? -> Introduction and overview of IPython's features.", - "%quickref -> Quick reference.", - "help -> Python's own help system.", - "object? -> Details about 'object', use 'object??' 
for extra details.", - "", - "IPython profile: secret_project" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "jupyter notebook --generate-config" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ mv ~/.jupyter/jupyter_notebook_config.py ~/.jupyter/my_custom_config.py" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "jupyter notebook --config=~/.jupyter/my_custom_config.py" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch01.ipynb b/ch01.ipynb deleted file mode 100644 index 0d16f66e6..000000000 --- a/ch01.ipynb +++ /dev/null @@ -1,316 +0,0 @@ -{ - "metadata": { - "name": "generated_ch01" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Preliminaries" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "What Is This Book About?" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "What Kinds of Data?" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Why Python for Data Analysis?" 
- ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Python as Glue" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Solving the \u201cTwo-Language\u201d Problem" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Why Not Python?" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Essential Python Libraries" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "NumPy" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "pandas" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "matplotlib" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "IPython and Jupyter" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "SciPy" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "scikit-learn" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "statsmodels" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Installation and Setup" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Windows" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "C:\\Users\\wesm>python", - "Python 3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul 5 2016, 11:41:13)", - "[MSC v.1900 64 bit (AMD64)] on win32", - ">>>" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Apple (OS X, macOS)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ ipython" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "GNU/Linux" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ bash Anaconda3-4.1.0-Linux-x86_64.sh" - ] - }, - { 
- "cell_type": "raw", - "metadata": {}, - "source": [ - "export PATH=/home/$USER/anaconda/bin:$PATH" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Installing or Updating Python Packages" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "conda install " - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "pip install " - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "conda update " - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "pip install --upgrade " - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Python 2 and Python 3" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Integrated Development Environments (IDEs) and Text\n Editors" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Community and Conferences" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Navigating This Book" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Code Examples" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [5]: CODE EXAMPLE", - "Out[5]: OUTPUT" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Data for Examples" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Import Conventions" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import numpy as np", - "import matplotlib.pyplot as plt", - "import pandas as pd", - "import seaborn as sns", - "import statsmodels as sm" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Jargon" - ] - } - ], - "metadata": {} - } - ] -} \ No newline at end of file diff --git a/ch02.ipynb b/ch02.ipynb index 7b7adeffb..35bb2204d 100644 --- a/ch02.ipynb +++ b/ch02.ipynb @@ -1,1668 +1,2092 @@ { + 
"cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Python Language Basics, IPython, and Jupyter Notebooks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "np.random.seed(12345)\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## The Python Interpreter" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ python\n", + "Python 3.6.0 | packaged by conda-forge | (default, Jan 13 2017, 23:17:12)\n", + "[GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] on linux\n", + "Type \"help\", \"copyright\", \"credits\" or \"license\" for more information.\n", + ">>> a = 5\n", + ">>> print(a)\n", + "5" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "print('Hello world')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ python hello_world.py\n", + "Hello world" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ ipython\n", + "Python 3.6.0 | packaged by conda-forge | (default, Jan 13 2017, 23:17:12)\n", + "Type \"copyright\", \"credits\" or \"license\" for more information.\n", + "\n", + "IPython 5.1.0 -- An enhanced Interactive Python.\n", + "? -> Introduction and overview of IPython's features.\n", + "%quickref -> Quick reference.\n", + "help -> Python's own help system.\n", + "object? -> Details about 'object', use 'object??' 
for extra details.\n", + "\n", + "In [1]: %run hello_world.py\n", + "Hello world\n", + "\n", + "In [2]:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## IPython Basics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Running the IPython Shell" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "data = {i : np.random.randn() for i in range(7)}\n", + "data" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + ">>> from numpy.random import randn\n", + ">>> data = {i : randn() for i in range(7)}\n", + ">>> print(data)\n", + "{0: -1.5948255432744511, 1: 0.10569006472787983, 2: 1.972367135977295,\n", + "3: 0.15455217573074576, 4: -0.24058577449429575, 5: -1.2904897053651216,\n", + "6: 0.3308507317325902}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Running the Jupyter Notebook" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "$ jupyter notebook\n", + "[I 15:20:52.739 NotebookApp] Serving notebooks from local directory:\n", + "/home/wesm/code/pydata-book\n", + "[I 15:20:52.739 NotebookApp] 0 active kernels\n", + "[I 15:20:52.739 NotebookApp] The Jupyter Notebook is running at:\n", + "http://localhost:8888/\n", + "[I 15:20:52.740 NotebookApp] Use Control-C to stop this server and shut down\n", + "all kernels (twice to skip confirmation).\n", + "Created new window in existing browser session." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Tab Completion" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: an_apple = 27\n", + "\n", + "In [2]: an_example = 42\n", + "\n", + "In [3]: an" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [3]: b = [1, 2, 3]\n", + "\n", + "In [4]: b." + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: import datetime\n", + "\n", + "In [2]: datetime." + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [7]: datasets/movielens/" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Introspection" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [8]: b = [1, 2, 3]\n", + "\n", + "In [9]: b?\n", + "Type: list\n", + "String Form:[1, 2, 3]\n", + "Length: 3\n", + "Docstring:\n", + "list() -> new empty list\n", + "list(iterable) -> new list initialized from iterable's items\n", + "\n", + "In [10]: print?\n", + "Docstring:\n", + "print(value, ..., sep=' ', end='\\n', file=sys.stdout, flush=False)\n", + "\n", + "Prints the values to a stream, or to sys.stdout by default.\n", + "Optional keyword arguments:\n", + "file: a file-like object (stream); defaults to the current sys.stdout.\n", + "sep: string inserted between values, default a space.\n", + "end: string appended after the last value, default a newline.\n", + "flush: whether to forcibly flush the stream.\n", + "Type: builtin_function_or_method" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def add_numbers(a, b):\n", + " \"\"\"\n", + " Add two 
numbers together\n", + "\n", + " Returns\n", + " -------\n", + " the_sum : type of arguments\n", + " \"\"\"\n", + " return a + b" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [11]: add_numbers?\n", + "Signature: add_numbers(a, b)\n", + "Docstring:\n", + "Add two numbers together\n", + "\n", + "Returns\n", + "-------\n", + "the_sum : type of arguments\n", + "File: \n", + "Type: function" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [12]: add_numbers??\n", + "Signature: add_numbers(a, b)\n", + "Source:\n", + "def add_numbers(a, b):\n", + " \"\"\"\n", + " Add two numbers together\n", + "\n", + " Returns\n", + " -------\n", + " the_sum : type of arguments\n", + " \"\"\"\n", + " return a + b\n", + "File: \n", + "Type: function" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [13]: np.*load*?\n", + "np.__loader__\n", + "np.load\n", + "np.loads\n", + "np.loadtxt\n", + "np.pkgload" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### The %run Command" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def f(x, y, z):\n", + " return (x + y) / z\n", + "\n", + "a = 5\n", + "b = 6\n", + "c = 7.5\n", + "\n", + "result = f(a, b, c)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [14]: %run ipython_script_test.py" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [15]: c\n", + "Out [15]: 7.5\n", + "\n", + "In [16]: result\n", + "Out[16]: 1.4666666666666666" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + ">>> %load ipython_script_test.py\n", + "\n", 
+ " def f(x, y, z):\n", + " return (x + y) / z\n", + "\n", + " a = 5\n", + " b = 6\n", + " c = 7.5\n", + "\n", + " result = f(a, b, c)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Interrupting running code" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Executing Code from the Clipboard" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "x = 5\n", + "y = 7\n", + "if x > 5:\n", + " x += 1\n", + "\n", + " y = 8" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [17]: %paste\n", + "x = 5\n", + "y = 7\n", + "if x > 5:\n", + " x += 1\n", + "\n", + " y = 8\n", + "## -- End pasted text --" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [18]: %cpaste\n", + "Pasting code; enter '--' alone on the line to stop or use Ctrl-D.\n", + ":x = 5\n", + ":y = 7\n", + ":if x > 5:\n", + ": x += 1\n", + ":\n", + ": y = 8\n", + ":--" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Terminal Keyboard Shortcuts" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### About Magic Commands" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [20]: a = np.random.randn(100, 100)\n", + "\n", + "In [20]: %timeit np.dot(a, a)\n", + "10000 loops, best of 3: 20.9 µs per loop" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [21]: %debug?\n", + "Docstring:\n", + "::\n", + "\n", + " %debug [--breakpoint FILE:LINE] [statement [statement ...]]\n", + "\n", + "Activate the interactive 
debugger.\n", + "\n", + "This magic command support two ways of activating debugger.\n", + "One is to activate debugger before executing code. This way, you\n", + "can set a break point, to step through the code from the point.\n", + "You can use this mode by giving statements to execute and optionally\n", + "a breakpoint.\n", + "\n", + "The other one is to activate debugger in post-mortem mode. You can\n", + "activate this mode simply running %debug without any argument.\n", + "If an exception has just occurred, this lets you inspect its stack\n", + "frames interactively. Note that this will always work only on the last\n", + "traceback that occurred, so you must call this quickly after an\n", + "exception that you wish to inspect has fired, because if another one\n", + "occurs, it clobbers the previous one.\n", + "\n", + "If you want IPython to automatically do this on every exception, see\n", + "the %pdb magic for more details.\n", + "\n", + "positional arguments:\n", + " statement Code to run in debugger. You can omit this in cell\n", + " magic mode.\n", + "\n", + "optional arguments:\n", + " --breakpoint , -b \n", + " Set break point at LINE in FILE." 
+ ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [22]: %pwd\n", + "Out[22]: '/home/wesm/code/pydata-book\n", + "\n", + "In [23]: foo = %pwd\n", + "\n", + "In [24]: foo\n", + "Out[24]: '/home/wesm/code/pydata-book'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Matplotlib Integration" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [26]: %matplotlib\n", + "Using matplotlib backend: Qt4Agg" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [26]: %matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Python Language Basics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Language Semantics" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Indentation, not braces" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "for x in array:\n", + " if x < pivot:\n", + " less.append(x)\n", + " else:\n", + " greater.append(x)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "a = 5; b = 6; c = 7" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Everything is an object" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Comments" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "results = []\n", + "for line in file_handle:\n", + " # keep the empty lines 
for now\n", + " # if len(line) == 0:\n", + " # continue\n", + " results.append(line.replace('foo', 'bar'))" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "print(\"Reached this line\") # Simple status report" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Function and object method calls" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = f(x, y, z)\n", + "g()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "obj.some_method(x, y, z)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = f(a, b, c, d=5, e='foo')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Variables and argument passing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = [1, 2, 3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b = a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a.append(4)\n", + "b" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def append_element(some_list, element):\n", + " some_list.append(element)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [27]: data = [1, 2, 3]\n", + "\n", + "In [28]: append_element(data, 4)\n", + "\n", + "In 
[29]: data\n", + "Out[29]: [1, 2, 3, 4]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Dynamic references, strong types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 5\n", + "type(a)\n", + "a = 'foo'\n", + "type(a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'5' + 5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 4.5\n", + "b = 2\n", + "# String formatting, to be visited later\n", + "print('a is {0}, b is {1}'.format(type(a), type(b)))\n", + "a / b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 5\n", + "isinstance(a, int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 5; b = 4.5\n", + "isinstance(a, (int, float))\n", + "isinstance(b, (int, float))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Attributes and methods" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [1]: a = 'foo'\n", + "\n", + "In [2]: a.\n", + "a.capitalize a.format a.isupper a.rindex a.strip\n", + "a.center a.index a.join a.rjust a.swapcase\n", + "a.count a.isalnum a.ljust a.rpartition a.title\n", + "a.decode a.isalpha a.lower a.rsplit a.translate\n", + "a.encode a.isdigit a.lstrip a.rstrip a.upper\n", + 
"a.endswith a.islower a.partition a.split a.zfill\n", + "a.expandtabs a.isspace a.replace a.splitlines\n", + "a.find a.istitle a.rfind a.startswith" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 'foo'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "getattr(a, 'split')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Duck typing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def isiterable(obj):\n", + " try:\n", + " iter(obj)\n", + " return True\n", + " except TypeError: # not iterable\n", + " return False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "isiterable('a string')\n", + "isiterable([1, 2, 3])\n", + "isiterable(5)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "if not isinstance(x, list) and isiterable(x):\n", + " x = list(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Imports" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# some_module.py\n", + "PI = 3.14159\n", + "\n", + "def f(x):\n", + " return x + 2\n", + "\n", + "def g(a, b):\n", + " return a + b" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import some_module\n", + "result = some_module.f(5)\n", + "pi = some_module.PI" + 
] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from some_module import f, g, PI\n", + "result = g(5, PI)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import some_module as sm\n", + "from some_module import PI as pi, g as gf\n", + "\n", + "r1 = sm.f(pi)\n", + "r2 = gf(6, pi)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Binary operators and comparisons" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "5 - 7\n", + "12 + 21.5\n", + "5 <= 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = [1, 2, 3]\n", + "b = a\n", + "c = list(a)\n", + "a is b\n", + "a is not c" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a == c" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = None\n", + "a is None" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Mutable and immutable objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a_list = ['foo', 2, [4, 5]]\n", + "a_list[2] = (3, 4)\n", + "a_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": 
[], + "source": [ + "a_tuple = (3, 5, (4, 5))\n", + "a_tuple[1] = 'four'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Scalar Types" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Numeric types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ival = 17239871\n", + "ival ** 6" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fval = 7.243\n", + "fval2 = 6.78e-5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "3 / 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "3 // 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Strings" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "a = 'one way of writing a string'\n", + "b = \"another way\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "c = \"\"\"\n", + "This is a longer string that\n", + "spans multiple lines\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "c.count('\\n')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 'this is a string'\n", + "a[10] = 'f'\n", + "b = a.replace('string', 'longer string')\n", + "b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 5.6\n", + "s = str(a)\n", + "print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s = 'python'\n", + "list(s)\n", + "s[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s = '12\\\\34'\n", + "print(s)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s = r'this\\has\\no\\special\\characters'\n", + "s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 'this is the first half '\n", + "b = 'and this is the second half'\n", + "a + b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "template = '{0:.2f} {1:s} are worth US${2:d}'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "template.format(4.5560, 'Argentine 
Pesos', 1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Bytes and Unicode" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val = \"español\"\n", + "val" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val_utf8 = val.encode('utf-8')\n", + "val_utf8\n", + "type(val_utf8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val_utf8.decode('utf-8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val.encode('latin1')\n", + "val.encode('utf-16')\n", + "val.encode('utf-16le')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bytes_val = b'this is bytes'\n", + "bytes_val\n", + "decoded = bytes_val.decode('utf8')\n", + "decoded # this is str (Unicode) now" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Booleans" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "True and True\n", + "False or True" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Type casting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, 
+ "editable": true + }, + "outputs": [], + "source": [ + "s = '3.14159'\n", + "fval = float(s)\n", + "type(fval)\n", + "int(fval)\n", + "bool(fval)\n", + "bool(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### None" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = None\n", + "a is None\n", + "b = 5\n", + "b is not None" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def add_and_maybe_multiply(a, b, c=None):\n", + " result = a + b\n", + "\n", + " if c is not None:\n", + " result = result * c\n", + "\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "type(None)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Dates and times" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from datetime import datetime, date, time\n", + "dt = datetime(2011, 10, 29, 20, 30, 21)\n", + "dt.day\n", + "dt.minute" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dt.date()\n", + "dt.time()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dt.strftime('%m/%d/%Y %H:%M')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + 
}, + "outputs": [], + "source": [ + "datetime.strptime('20091031', '%Y%m%d')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dt.replace(minute=0, second=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dt2 = datetime(2011, 11, 15, 22, 30)\n", + "delta = dt2 - dt\n", + "delta\n", + "type(delta)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dt\n", + "dt + delta" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Control Flow" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### if, elif, and else" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "if x < 0:\n", + " print('It's negative')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "if x < 0:\n", + " print('It's negative')\n", + "elif x == 0:\n", + " print('Equal to zero')\n", + "elif 0 < x < 5:\n", + " print('Positive but smaller than 5')\n", + "else:\n", + " print('Positive and larger than or equal to 5')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = 5; b = 7\n", + "c = 8; d = 4\n", + "if a < b or c > d:\n", + " print('Made it')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "4 > 3 > 2 > 
1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### for loops" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "for value in collection:\n", + " # do something with value" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "sequence = [1, 2, None, 4, None, 5]\n", + "total = 0\n", + "for value in sequence:\n", + " if value is None:\n", + " continue\n", + " total += value" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "sequence = [1, 2, 0, 4, 6, 5, 2, 1]\n", + "total_until_5 = 0\n", + "for value in sequence:\n", + " if value == 5:\n", + " break\n", + " total_until_5 += value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for i in range(4):\n", + " for j in range(4):\n", + " if j > i:\n", + " break\n", + " print((i, j))" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "for a, b, c in iterator:\n", + " # do something" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### while loops" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "x = 256\n", + "total = 0\n", + "while x > 0:\n", + " if total > 500:\n", + " break\n", + " total += x\n", + " x = x // 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### pass" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "if x < 0:\n", + " print('negative!')\n", + "elif x == 0:\n", + " # TODO: put something smart 
here\n", + " pass\n", + "else:\n", + " print('positive!')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### range" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "range(10)\n", + "list(range(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "list(range(0, 20, 2))\n", + "list(range(5, 0, -1))" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "seq = [1, 2, 3, 4]\n", + "for i in range(len(seq)):\n", + " val = seq[i]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "sum = 0\n", + "for i in range(100000):\n", + " # % is the modulo operator\n", + " if i % 3 == 0 or i % 5 == 0:\n", + " sum += i" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Ternary expressions" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "value = " + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "if " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x = 5\n", + "'Non-negative' if x >= 0 else 'Negative'" + ] + } + ], "metadata": { - "name": "generated_ch02" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Python Language Basics, IPython, and Jupyter Notebooks" - ] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "np.random.seed(12345)", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "The Python Interpreter" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ python", - "Python 3.6.0 | packaged by conda-forge | (default, Jan 13 2017, 23:17:12)", - "[GCC 4.8.2 20140120 (Red Hat 4.8.2-15)] on linux", - "Type \"help\", \"copyright\", \"credits\" or \"license\" for more information.", - ">>> a = 5", - ">>> print(a)", - "5" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "print('Hello world')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ python hello_world.py", - "Hello world" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ ipython", - "Python 3.6.0 | packaged by conda-forge | (default, Jan 13 2017, 23:17:12)", - "Type \"copyright\", \"credits\" or \"license\" for more information.", - "", - "IPython 5.1.0 -- An enhanced Interactive Python.", - "? -> Introduction and overview of IPython's features.", - "%quickref -> Quick reference.", - "help -> Python's own help system.", - "object? -> Details about 'object', use 'object??' 
for extra details.", - "", - "In [1]: %run hello_world.py", - "Hello world", - "", - "In [2]:" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "IPython Basics" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Running the IPython Shell" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np", - "data = {i : np.random.randn() for i in range(7)}", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - ">>> from numpy.random import randn", - ">>> data = {i : randn() for i in range(7)}", - ">>> print(data)", - "{0: -1.5948255432744511, 1: 0.10569006472787983, 2: 1.972367135977295,", - "3: 0.15455217573074576, 4: -0.24058577449429575, 5: -1.2904897053651216,", - "6: 0.3308507317325902}" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Running the Jupyter Notebook" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "$ jupyter notebook", - "[I 15:20:52.739 NotebookApp] Serving notebooks from local directory:", - "/home/wesm/code/pydata-book", - "[I 15:20:52.739 NotebookApp] 0 active kernels", - "[I 15:20:52.739 NotebookApp] The Jupyter Notebook is running at:", - "http://localhost:8888/", - "[I 15:20:52.740 NotebookApp] Use Control-C to stop this server and shut down", - "all kernels (twice to skip confirmation).", - "Created new window in existing browser session." - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Tab Completion" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: an_apple = 27", - "", - "In [2]: an_example = 42", - "", - "In [3]: an" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [3]: b = [1, 2, 3]", - "", - "In [4]: b." 
- ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: import datetime", - "", - "In [2]: datetime." - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [7]: datasets/movielens/" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Introspection" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [8]: b = [1, 2, 3]", - "", - "In [9]: b?", - "Type: list", - "String Form:[1, 2, 3]", - "Length: 3", - "Docstring:", - "list() -> new empty list", - "list(iterable) -> new list initialized from iterable's items", - "", - "In [10]: print?", - "Docstring:", - "print(value, ..., sep=' ', end='\\n', file=sys.stdout, flush=False)", - "", - "Prints the values to a stream, or to sys.stdout by default.", - "Optional keyword arguments:", - "file: a file-like object (stream); defaults to the current sys.stdout.", - "sep: string inserted between values, default a space.", - "end: string appended after the last value, default a newline.", - "flush: whether to forcibly flush the stream.", - "Type: builtin_function_or_method" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def add_numbers(a, b):", - " \"\"\"", - " Add two numbers together", - "", - " Returns", - " -------", - " the_sum : type of arguments", - " \"\"\"", - " return a + b" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [11]: add_numbers?", - "Signature: add_numbers(a, b)", - "Docstring:", - "Add two numbers together", - "", - "Returns", - "-------", - "the_sum : type of arguments", - "File: ", - "Type: function" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [12]: add_numbers??", - "Signature: add_numbers(a, b)", - "Source:", - "def add_numbers(a, b):", - " \"\"\"", - " Add two numbers together", - "", - " Returns", - " -------", - " the_sum : type of arguments", - " \"\"\"", - " return a + b", - "File: ", - "Type: function" - ] - }, - { - 
"cell_type": "raw", - "metadata": {}, - "source": [ - "In [13]: np.*load*?", - "np.__loader__", - "np.load", - "np.loads", - "np.loadtxt", - "np.pkgload" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "The %run Command" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def f(x, y, z):", - " return (x + y) / z", - "", - "a = 5", - "b = 6", - "c = 7.5", - "", - "result = f(a, b, c)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [14]: %run ipython_script_test.py" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [15]: c", - "Out [15]: 7.5", - "", - "In [16]: result", - "Out[16]: 1.4666666666666666" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - ">>> %load ipython_script_test.py", - "", - " def f(x, y, z):", - " return (x + y) / z", - "", - " a = 5", - " b = 6", - " c = 7.5", - "", - " result = f(a, b, c)" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Interrupting running code" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Executing Code from the Clipboard" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "x = 5", - "y = 7", - "if x > 5:", - " x += 1", - "", - " y = 8" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [17]: %paste", - "x = 5", - "y = 7", - "if x > 5:", - " x += 1", - "", - " y = 8", - "## -- End pasted text --" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [18]: %cpaste", - "Pasting code; enter '--' alone on the line to stop or use Ctrl-D.", - ":x = 5", - ":y = 7", - ":if x > 5:", - ": x += 1", - ":", - ": y = 8", - ":--" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Terminal Keyboard Shortcuts" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "About Magic Commands" - ] - }, - { - "cell_type": 
"raw", - "metadata": {}, - "source": [ - "In [20]: a = np.random.randn(100, 100)", - "", - "In [20]: %timeit np.dot(a, a)", - "10000 loops, best of 3: 20.9 \u00b5s per loop" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [21]: %debug?", - "Docstring:", - "::", - "", - " %debug [--breakpoint FILE:LINE] [statement [statement ...]]", - "", - "Activate the interactive debugger.", - "", - "This magic command support two ways of activating debugger.", - "One is to activate debugger before executing code. This way, you", - "can set a break point, to step through the code from the point.", - "You can use this mode by giving statements to execute and optionally", - "a breakpoint.", - "", - "The other one is to activate debugger in post-mortem mode. You can", - "activate this mode simply running %debug without any argument.", - "If an exception has just occurred, this lets you inspect its stack", - "frames interactively. Note that this will always work only on the last", - "traceback that occurred, so you must call this quickly after an", - "exception that you wish to inspect has fired, because if another one", - "occurs, it clobbers the previous one.", - "", - "If you want IPython to automatically do this on every exception, see", - "the %pdb magic for more details.", - "", - "positional arguments:", - " statement Code to run in debugger. You can omit this in cell", - " magic mode.", - "", - "optional arguments:", - " --breakpoint , -b ", - " Set break point at LINE in FILE." 
- ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [22]: %pwd", - "Out[22]: '/home/wesm/code/pydata-book", - "", - "In [23]: foo = %pwd", - "", - "In [24]: foo", - "Out[24]: '/home/wesm/code/pydata-book'" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Matplotlib Integration" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [26]: %matplotlib", - "Using matplotlib backend: Qt4Agg" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [26]: %matplotlib inline" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Python Language Basics" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Language Semantics" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Indentation, not braces" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "for x in array:", - " if x < pivot:", - " less.append(x)", - " else:", - " greater.append(x)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "a = 5; b = 6; c = 7" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Everything is an object" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Comments" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "results = []", - "for line in file_handle:", - " # keep the empty lines for now", - " # if len(line) == 0:", - " # continue", - " results.append(line.replace('foo', 'bar'))" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "print(\"Reached this line\") # Simple status report" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Function and object method calls" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = f(x, y, z)", - "g()" - ] - }, - { - "cell_type": "raw", - 
"metadata": {}, - "source": [ - "obj.some_method(x, y, z)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = f(a, b, c, d=5, e='foo')" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Variables and argument passing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = [1, 2, 3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b = a" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a.append(4)", - "b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def append_element(some_list, element):", - " some_list.append(element)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [27]: data = [1, 2, 3]", - "", - "In [28]: append_element(data, 4)", - "", - "In [29]: data", - "Out[29]: [1, 2, 3, 4]" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Dynamic references, strong types" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 5", - "type(a)", - "a = 'foo'", - "type(a)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'5' + 5" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 4.5", - "b = 2", - "# String formatting, to be visited later", - "print('a is {0}, b is {1}'.format(type(a), type(b)))", - "a / b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 5", - "isinstance(a, int)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "a = 5; b = 4.5", - "isinstance(a, (int, float))", - "isinstance(b, (int, float))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Attributes and methods" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [1]: a = 'foo'", - "", - "In [2]: a.", - "a.capitalize a.format a.isupper a.rindex a.strip", - "a.center a.index a.join a.rjust a.swapcase", - "a.count a.isalnum a.ljust a.rpartition a.title", - "a.decode a.isalpha a.lower a.rsplit a.translate", - "a.encode a.isdigit a.lstrip a.rstrip a.upper", - "a.endswith a.islower a.partition a.split a.zfill", - "a.expandtabs a.isspace a.replace a.splitlines", - "a.find a.istitle a.rfind a.startswith" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 'foo'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "getattr(a, 'split')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Duck typing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def isiterable(obj):", - " try:", - " iter(obj)", - " return True", - " except TypeError: # not iterable", - " return False" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "isiterable('a string')", - "isiterable([1, 2, 3])", - "isiterable(5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "if not isinstance(x, list) and isiterable(x):", - " x = list(x)" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Imports" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "# some_module.py", - "PI = 3.14159", - "", - "def 
f(x):", - " return x + 2", - "", - "def g(a, b):", - " return a + b" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import some_module", - "result = some_module.f(5)", - "pi = some_module.PI" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from some_module import f, g, PI", - "result = g(5, PI)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import some_module as sm", - "from some_module import PI as pi, g as gf", - "", - "r1 = sm.f(pi)", - "r2 = gf(6, pi)" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Binary operators and comparisons" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "5 - 7", - "12 + 21.5", - "5 <= 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = [1, 2, 3]", - "b = a", - "c = list(a)", - "a is b", - "a is not c" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a == c" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = None", - "a is None" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Mutable and immutable objects" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a_list = ['foo', 2, [4, 5]]", - "a_list[2] = (3, 4)", - "a_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a_tuple = (3, 5, (4, 5))", - "a_tuple[1] = 'four'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Scalar Types" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - 
"source": [ - "Numeric types" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ival = 17239871", - "ival ** 6" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fval = 7.243", - "fval2 = 6.78e-5" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "3 / 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "3 // 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Strings" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "a = 'one way of writing a string'", - "b = \"another way\"" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "c = \"\"\"", - "This is a longer string that", - "spans multiple lines", - "\"\"\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "c.count('\\n')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 'this is a string'", - "a[10] = 'f'", - "b = a.replace('string', 'longer string')", - "b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 5.6", - "s = str(a)", - "print(s)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = 'python'", - "list(s)", - "s[:3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": 
false, - "input": [ - "s = '12\\\\34'", - "print(s)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = r'this\\has\\no\\special\\characters'", - "s" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 'this is the first half '", - "b = 'and this is the second half'", - "a + b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "template = '{0:.2f} {1:s} are worth US${2:d}'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "template.format(4.5560, 'Argentine Pesos', 1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Bytes and Unicode" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val = \"espa\u00f1ol\"", - "val" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val_utf8 = val.encode('utf-8')", - "val_utf8", - "type(val_utf8)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val_utf8.decode('utf-8')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.encode('latin1')", - "val.encode('utf-16')", - "val.encode('utf-16le')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bytes_val = b'this is bytes'", - "bytes_val", - "decoded = bytes_val.decode('utf8')", - "decoded # this is str (Unicode) now" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"heading", - "level": 4, - "metadata": {}, - "source": [ - "Booleans" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "True and True", - "False or True" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Type casting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = '3.14159'", - "fval = float(s)", - "type(fval)", - "int(fval)", - "bool(fval)", - "bool(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "None" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = None", - "a is None", - "b = 5", - "b is not None" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def add_and_maybe_multiply(a, b, c=None):", - " result = a + b", - "", - " if c is not None:", - " result = result * c", - "", - " return result" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "type(None)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Dates and times" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime, date, time", - "dt = datetime(2011, 10, 29, 20, 30, 21)", - "dt.day", - "dt.minute" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dt.date()", - "dt.time()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dt.strftime('%m/%d/%Y %H:%M')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "datetime.strptime('20091031', 
'%Y%m%d')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dt.replace(minute=0, second=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dt2 = datetime(2011, 11, 15, 22, 30)", - "delta = dt2 - dt", - "delta", - "type(delta)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dt", - "dt + delta" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Control Flow" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "if, elif, and else" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "if x < 0:", - " print('It's negative')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "if x < 0:", - " print('It's negative')", - "elif x == 0:", - " print('Equal to zero')", - "elif 0 < x < 5:", - " print('Positive but smaller than 5')", - "else:", - " print('Positive and larger than or equal to 5')" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = 5; b = 7", - "c = 8; d = 4", - "if a < b or c > d:", - " print('Made it')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "4 > 3 > 2 > 1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "for loops" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "for value in collection:", - " # do something with value" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "sequence = [1, 2, None, 4, None, 5]", - "total = 0", - "for value in sequence:", - " if value is None:", - " continue", - " 
total += value" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "sequence = [1, 2, 0, 4, 6, 5, 2, 1]", - "total_until_5 = 0", - "for value in sequence:", - " if value == 5:", - " break", - " total_until_5 += value" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for i in range(4):", - " for j in range(4):", - " if j > i:", - " break", - " print((i, j))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "for a, b, c in iterator:", - " # do something" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "while loops" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "x = 256", - "total = 0", - "while x > 0:", - " if total > 500:", - " break", - " total += x", - " x = x // 2" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "pass" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "if x < 0:", - " print('negative!')", - "elif x == 0:", - " # TODO: put something smart here", - " pass", - "else:", - " print('positive!')" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "range" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "range(10)", - "list(range(10))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "list(range(0, 20, 2))", - "list(range(5, 0, -1))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "seq = [1, 2, 3, 4]", - "for i in range(len(seq)):", - " val = seq[i]" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "sum = 0", - "for i in range(100000):", - " # % is the modulo operator", - " if i % 3 == 0 or i % 5 == 0:", - " sum += i" - ] - }, - { - "cell_type": "heading", - "level": 4, - 
"metadata": {}, - "source": [ - "Ternary expressions" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "value = " - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "if " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = 5", - "'Non-negative' if x >= 0 else 'Negative'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch03.ipynb b/ch03.ipynb index 97f3b288f..4509fb8b3 100644 --- a/ch03.ipynb +++ b/ch03.ipynb @@ -1,1947 +1,2464 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Built-in Data Structures, Functions, " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Data Structures and Sequences" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Tuple" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tup = 4, 5, 6\n", + "tup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nested_tup = (4, 5, 6), (7, 8)\n", + 
"nested_tup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tuple([4, 0, 2])\n", + "tup = tuple('string')\n", + "tup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tup[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tup = tuple(['foo', [1, 2], True])\n", + "tup[2] = False" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tup[1].append(3)\n", + "tup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "(4, None, 'foo') + (6, 0) + ('bar',)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "('foo', 'bar') * 4" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Unpacking tuples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tup = (4, 5, 6)\n", + "a, b, c = tup\n", + "b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tup = 4, 5, (6, 7)\n", + "a, b, (c, d) = tup\n", + "d" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + 
}, + "source": [ + "tmp = a\n", + "a = b\n", + "b = tmp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a, b = 1, 2\n", + "a\n", + "b\n", + "b, a = a, b\n", + "a\n", + "b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]\n", + "for a, b, c in seq:\n", + " print('a={0}, b={1}, c={2}'.format(a, b, c))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "values = 1, 2, 3, 4, 5\n", + "a, b, *rest = values\n", + "a, b\n", + "rest" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a, b, *_ = values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Tuple methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = (1, 2, 2, 2, 3, 4, 2)\n", + "a.count(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### List" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a_list = [2, 3, 7, None]\n", + "tup = ('foo', 'bar', 'baz')\n", + "b_list = list(tup)\n", + "b_list\n", + "b_list[1] = 'peekaboo'\n", + "b_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, 
+ "editable": true + }, + "outputs": [], + "source": [ + "gen = range(10)\n", + "gen\n", + "list(gen)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Adding and removing elements" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b_list.append('dwarf')\n", + "b_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b_list.insert(1, 'red')\n", + "b_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b_list.pop(2)\n", + "b_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b_list.append('foo')\n", + "b_list\n", + "b_list.remove('foo')\n", + "b_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'dwarf' in b_list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'dwarf' not in b_list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Concatenating and combining lists" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "[4, None, 'foo'] + [7, 8, (2, 3)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x = [4, None, 'foo']\n", + "x.extend([7, 8, (2, 3)])\n", + "x" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "everything = []\n", + "for chunk in list_of_lists:\n", + " everything.extend(chunk)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "everything = []\n", + "for chunk in list_of_lists:\n", + " everything = everything + chunk" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Sorting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = [7, 2, 5, 1, 3]\n", + "a.sort()\n", + "a" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b = ['saw', 'small', 'He', 'foxes', 'six']\n", + "b.sort(key=len)\n", + "b" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Binary search and maintaining a sorted list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import bisect\n", + "c = [1, 2, 2, 2, 3, 4, 7]\n", + "bisect.bisect(c, 2)\n", + "bisect.bisect(c, 5)\n", + "bisect.insort(c, 6)\n", + "c" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Slicing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq = 
[7, 2, 3, 7, 5, 6, 0, 1]\n", + "seq[1:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq[3:4] = [6, 3]\n", + "seq" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq[:5]\n", + "seq[3:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq[-4:]\n", + "seq[-6:-2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq[::2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq[::-1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Built-in Sequence Functions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### enumerate" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "i = 0\n", + "for value in collection:\n", + " # do something with value\n", + " i += 1" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "for i, value in enumerate(collection):\n", + " # do something with value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "some_list = ['foo', 'bar', 'baz']\n", + "mapping = {}\n", + "for i, v in enumerate(some_list):\n", + 
" mapping[v] = i\n", + "mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### sorted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sorted([7, 1, 2, 6, 0, 3, 2])\n", + "sorted('horse race')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq1 = ['foo', 'bar', 'baz']\n", + "seq2 = ['one', 'two', 'three']\n", + "zipped = zip(seq1, seq2)\n", + "list(zipped)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "seq3 = [False, True]\n", + "list(zip(seq1, seq2, seq3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for i, (a, b) in enumerate(zip(seq1, seq2)):\n", + " print('{0}: {1}, {2}'.format(i, a, b))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens'),\n", + " ('Schilling', 'Curt')]\n", + "first_names, last_names = zip(*pitchers)\n", + "first_names\n", + "last_names" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### reversed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": 
[ + "list(reversed(range(10)))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "empty_dict = {}\n", + "d1 = {'a' : 'some value', 'b' : [1, 2, 3, 4]}\n", + "d1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "d1[7] = 'an integer'\n", + "d1\n", + "d1['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'b' in d1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "d1[5] = 'some value'\n", + "d1\n", + "d1['dummy'] = 'another value'\n", + "d1\n", + "del d1[5]\n", + "d1\n", + "ret = d1.pop('dummy')\n", + "ret\n", + "d1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "list(d1.keys())\n", + "list(d1.values())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "d1.update({'b' : 'foo', 'c' : 12})\n", + "d1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Creating dicts from sequences" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "mapping = {}\n", + "for key, value in zip(key_list, value_list):\n", + " mapping[key] = value" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mapping = dict(zip(range(5), reversed(range(5))))\n", + "mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Default values" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "if key in some_dict:\n", + " value = some_dict[key]\n", + "else:\n", + " value = default_value" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "value = some_dict.get(key, default_value)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "words = ['apple', 'bat', 'bar', 'atom', 'book']\n", + "by_letter = {}\n", + "for word in words:\n", + " letter = word[0]\n", + " if letter not in by_letter:\n", + " by_letter[letter] = [word]\n", + " else:\n", + " by_letter[letter].append(word)\n", + "by_letter" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "for word in words:\n", + " letter = word[0]\n", + " by_letter.setdefault(letter, []).append(word)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from collections import defaultdict\n", + "by_letter = defaultdict(list)\n", + "for word in words:\n", + " by_letter[word[0]].append(word)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Valid dict key types" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "hash('string')\n", + "hash((1, 
2, (2, 3)))\n", + "hash((1, 2, [2, 3])) # fails because lists are mutable" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "d = {}\n", + "d[tuple([1, 2, 3])] = 5\n", + "d" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "set([2, 2, 2, 1, 3, 3])\n", + "{2, 2, 2, 1, 3, 3}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = {1, 2, 3, 4, 5}\n", + "b = {3, 4, 5, 6, 7, 8}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a.union(b)\n", + "a | b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a.intersection(b)\n", + "a & b" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "c = a.copy()\n", + "c |= b\n", + "c\n", + "d = a.copy()\n", + "d &= b\n", + "d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "my_data = [1, 2, 3, 4]\n", + "my_set = {tuple(my_data)}\n", + "my_set" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"a_set = {1, 2, 3, 4, 5}\n", + "{1, 2, 3}.issubset(a_set)\n", + "a_set.issuperset({1, 2, 3})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "{1, 2, 3} == {3, 2, 1}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### List, Set, and Dict Comprehensions" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "[" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = []\n", + "for val in collection:\n", + " if " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "strings = ['a', 'as', 'bat', 'car', 'dove', 'python']\n", + "[x.upper() for x in strings if len(x) > 2]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "dict_comp = {" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "set_comp = {" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "unique_lengths = {len(x) for x in strings}\n", + "unique_lengths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "set(map(len, strings))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "loc_mapping = {val : index for index, val in enumerate(strings)}\n", + "loc_mapping" + ] 
+ }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Nested list comprehensions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "all_data = [['John', 'Emily', 'Michael', 'Mary', 'Steven'],\n", + " ['Maria', 'Juan', 'Javier', 'Natalia', 'Pilar']]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "names_of_interest = []\n", + "for names in all_data:\n", + " enough_es = [name for name in names if name.count('e') >= 2]\n", + " names_of_interest.extend(enough_es)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = [name for names in all_data for name in names\n", + " if name.count('e') >= 2]\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]\n", + "flattened = [x for tup in some_tuples for x in tup]\n", + "flattened" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "flattened = []\n", + "\n", + "for tup in some_tuples:\n", + " for x in tup:\n", + " flattened.append(x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "[[x for x in tup] for tup in some_tuples]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Functions" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + 
"def my_function(x, y, z=1.5):\n", + " if z > 1:\n", + " return z * (x + y)\n", + " else:\n", + " return z / (x + y)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "my_function(5, 6, z=0.7)\n", + "my_function(3.14, 7, 3.5)\n", + "my_function(10, 20)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Namespaces, Scope, and Local Functions" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def func():\n", + " a = []\n", + " for i in range(5):\n", + " a.append(i)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "a = []\n", + "def func():\n", + " for i in range(5):\n", + " a.append(i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = None\n", + "def bind_a_variable():\n", + " global a\n", + " a = []\n", + "bind_a_variable()\n", + "print(a)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Returning Multiple Values" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def f():\n", + " a = 5\n", + " b = 6\n", + " c = 7\n", + " return a, b, c\n", + "\n", + "a, b, c = f()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "return_value = f()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def f():\n", + " a = 5\n", + " b = 6\n", + " c = 7\n", + " return {'a' : a, 'b' : b, 'c' : c}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Functions Are Objects" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'FlOrIda',\n", + " 'south carolina##', 'West virginia?']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import re\n", + "\n", + "def clean_strings(strings):\n", + " result = []\n", + " for value in strings:\n", + " value = value.strip()\n", + " value = re.sub('[!#?]', '', value)\n", + " value = value.title()\n", + " result.append(value)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "clean_strings(states)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def remove_punctuation(value):\n", + " return re.sub('[!#?]', '', value)\n", + "\n", + "clean_ops = [str.strip, remove_punctuation, str.title]\n", + "\n", + "def clean_strings(strings, ops):\n", + " result = []\n", + " for value in strings:\n", + " for function in ops:\n", + " value = function(value)\n", + " result.append(value)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "clean_strings(states, clean_ops)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for x in map(remove_punctuation, states):\n", + " print(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": 
true, + "editable": true + }, + "source": [ + "### Anonymous (Lambda) Functions" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def short_function(x):\n", + " return x * 2\n", + "\n", + "equiv_anon = lambda x: x * 2" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def apply_to_list(some_list, f):\n", + " return [f(x) for x in some_list]\n", + "\n", + "ints = [4, 0, 1, 5, 6]\n", + "apply_to_list(ints, lambda x: x * 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "strings = ['foo', 'card', 'bar', 'aaaa', 'abab']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "strings.sort(key=lambda x: len(set(list(x))))\n", + "strings" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Currying: Partial Argument Application" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def add_numbers(x, y):\n", + " return x + y" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "add_five = lambda y: add_numbers(5, y)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from functools import partial\n", + "add_five = partial(add_numbers, 5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Generators" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"some_dict = {'a': 1, 'b': 2, 'c': 3}\n", + "for key in some_dict:\n", + " print(key)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dict_iterator = iter(some_dict)\n", + "dict_iterator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "list(dict_iterator)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def squares(n=10):\n", + " print('Generating squares from 1 to {0}'.format(n ** 2))\n", + " for i in range(1, n + 1):\n", + " yield i ** 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "gen = squares()\n", + "gen" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for x in gen:\n", + " print(x, end=' ')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Generator expressions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "gen = (x ** 2 for x in range(100))\n", + "gen" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def _make_gen():\n", + " for x in range(100):\n", + " yield x ** 2\n", + "gen = _make_gen()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "sum(x ** 2 for x in range(100))\n", + "dict((i, i **2) for i in range(5))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### itertools module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import itertools\n", + "first_letter = lambda x: x[0]\n", + "names = ['Alan', 'Adam', 'Wes', 'Will', 'Albert', 'Steven']\n", + "for letter, names in itertools.groupby(names, first_letter):\n", + " print(letter, list(names)) # names is a generator" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Errors and Exception Handling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "float('1.2345')\n", + "float('something')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def attempt_float(x):\n", + " try:\n", + " return float(x)\n", + " except:\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "attempt_float('1.2345')\n", + "attempt_float('something')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "float((1, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def attempt_float(x):\n", + " try:\n", + " return 
float(x)\n", + " except ValueError:\n", + " return x" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "attempt_float((1, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def attempt_float(x):\n", + " try:\n", + " return float(x)\n", + " except (TypeError, ValueError):\n", + " return x" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "f = open(path, 'w')\n", + "\n", + "try:\n", + " write_to_file(f)\n", + "finally:\n", + " f.close()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "f = open(path, 'w')\n", + "\n", + "try:\n", + " write_to_file(f)\n", + "except:\n", + " print('Failed')\n", + "else:\n", + " print('Succeeded')\n", + "finally:\n", + " f.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Exceptions in IPython" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [10]: %run examples/ipython_bug.py\n", + "---------------------------------------------------------------------------\n", + "AssertionError Traceback (most recent call last)\n", + "/home/wesm/code/pydata-book/examples/ipython_bug.py in ()\n", + " 13 throws_an_exception()\n", + " 14\n", + "---> 15 calling_things()\n", + "\n", + "/home/wesm/code/pydata-book/examples/ipython_bug.py in calling_things()\n", + " 11 def calling_things():\n", + " 12 works_fine()\n", + "---> 13 throws_an_exception()\n", + " 14\n", + " 15 calling_things()\n", + "\n", + "/home/wesm/code/pydata-book/examples/ipython_bug.py in throws_an_exception()\n", + " 7 a = 5\n", + " 8 b = 6\n", + "----> 9 
assert(a + b == 10)\n", + " 10\n", + " 11 def calling_things():\n", + "\n", + "AssertionError:" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Files and the Operating System" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%pushd book-materials" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "path = 'examples/segismundo.txt'\n", + "f = open(path)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "for line in f:\n", + " pass" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "lines = [x.rstrip() for x in open(path)]\n", + "lines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with open(path) as f:\n", + " lines = [x.rstrip() for x in f]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f = open(path)\n", + "f.read(10)\n", + "f2 = open(path, 'rb') # Binary mode\n", + "f2.read(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f.tell()\n", + "f2.tell()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.getdefaultencoding()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f.seek(3)\n", + "f.read(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f.close()\n", + "f2.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with open('tmp.txt', 'w') as handle:\n", + " handle.writelines(x for x in open(path) if len(x) > 1)\n", + "with open('tmp.txt') as f:\n", + " lines = f.readlines()\n", + "lines" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import os\n", + "os.remove('tmp.txt')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Bytes and Unicode with Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with open(path) as f:\n", + " chars = f.read(10)\n", + "chars" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with open(path, 'rb') as f:\n", + " data = f.read(10)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "data.decode('utf8')\n", + "data[:4].decode('utf8')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sink_path = 'sink.txt'\n", + "with open(path) as source:\n", + " with open(sink_path, 'xt', encoding='iso-8859-1') as sink:\n", + " sink.write(source.read())\n", + "with open(sink_path, encoding='iso-8859-1') as f:\n", + " print(f.read(10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "os.remove(sink_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f = open(path)\n", + "f.read(5)\n", + "f.seek(4)\n", + "f.read(1)\n", + "f.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%popd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch03" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Built-in Data Structures, Functions, " - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data Structures and Sequences" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Tuple" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tup = 4, 5, 6", - "tup" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nested_tup 
= (4, 5, 6), (7, 8)", - "nested_tup" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tuple([4, 0, 2])", - "tup = tuple('string')", - "tup" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tup[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tup = tuple(['foo', [1, 2], True])", - "tup[2] = False" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tup[1].append(3)", - "tup" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "(4, None, 'foo') + (6, 0) + ('bar',)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "('foo', 'bar') * 4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Unpacking tuples" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tup = (4, 5, 6)", - "a, b, c = tup", - "b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tup = 4, 5, (6, 7)", - "a, b, (c, d) = tup", - "d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "tmp = a", - "a = b", - "b = tmp" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a, b = 1, 2", - "a", - "b", - "b, a = a, b", - "a", - "b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]", - "for a, b, c in seq:", - 
" print('a={0}, b={1}, c={2}'.format(a, b, c))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = 1, 2, 3, 4, 5", - "a, b, *rest = values", - "a, b", - "rest" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a, b, *_ = values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Tuple methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = (1, 2, 2, 2, 3, 4, 2)", - "a.count(2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "List" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a_list = [2, 3, 7, None]", - "tup = ('foo', 'bar', 'baz')", - "b_list = list(tup)", - "b_list", - "b_list[1] = 'peekaboo'", - "b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "gen = range(10)", - "gen", - "list(gen)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Adding and removing elements" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b_list.append('dwarf')", - "b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b_list.insert(1, 'red')", - "b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b_list.pop(2)", - "b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b_list.append('foo')", - 
"b_list", - "b_list.remove('foo')", - "b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'dwarf' in b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'dwarf' not in b_list" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Concatenating and combining lists" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "[4, None, 'foo'] + [7, 8, (2, 3)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = [4, None, 'foo']", - "x.extend([7, 8, (2, 3)])", - "x" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "everything = []", - "for chunk in list_of_lists:", - " everything.extend(chunk)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "everything = []", - "for chunk in list_of_lists:", - " everything = everything + chunk" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Sorting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = [7, 2, 5, 1, 3]", - "a.sort()", - "a" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b = ['saw', 'small', 'He', 'foxes', 'six']", - "b.sort(key=len)", - "b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Binary search and maintaining a sorted list" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import bisect", - "c = [1, 2, 2, 2, 3, 4, 7]", - "bisect.bisect(c, 2)", - "bisect.bisect(c, 5)", - 
"bisect.insort(c, 6)", - "c" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Slicing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq = [7, 2, 3, 7, 5, 6, 0, 1]", - "seq[1:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq[3:4] = [6, 3]", - "seq" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq[:5]", - "seq[3:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq[-4:]", - "seq[-6:-2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq[::2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq[::-1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Built-in Sequence Functions" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "enumerate" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "i = 0", - "for value in collection:", - " # do something with value", - " i += 1" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "for i, value in enumerate(collection):", - " # do something with value" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "some_list = ['foo', 'bar', 'baz']", - "mapping = {}", - "for i, v in enumerate(some_list):", - " mapping[v] = i", - "mapping" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "sorted" - ] - }, 
- { - "cell_type": "code", - "collapsed": false, - "input": [ - "sorted([7, 1, 2, 6, 0, 3, 2])", - "sorted('horse race')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "zip" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq1 = ['foo', 'bar', 'baz']", - "seq2 = ['one', 'two', 'three']", - "zipped = zip(seq1, seq2)", - "list(zipped)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "seq3 = [False, True]", - "list(zip(seq1, seq2, seq3))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for i, (a, b) in enumerate(zip(seq1, seq2)):", - " print('{0}: {1}, {2}'.format(i, a, b))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pitchers = [('Nolan', 'Ryan'), ('Roger', 'Clemens'),", - " ('Schilling', 'Curt')]", - "first_names, last_names = zip(*pitchers)", - "first_names", - "last_names" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "reversed" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "list(reversed(range(10)))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "dict" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "empty_dict = {}", - "d1 = {'a' : 'some value', 'b' : [1, 2, 3, 4]}", - "d1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "d1[7] = 'an integer'", - "d1", - "d1['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "'b' in d1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "d1[5] = 'some value'", - "d1", - "d1['dummy'] = 'another value'", - "d1", - "del d1[5]", - "d1", - "ret = d1.pop('dummy')", - "ret", - "d1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "list(d1.keys())", - "list(d1.values())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "d1.update({'b' : 'foo', 'c' : 12})", - "d1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Creating dicts from sequences" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "mapping = {}", - "for key, value in zip(key_list, value_list):", - " mapping[key] = value" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mapping = dict(zip(range(5), reversed(range(5))))", - "mapping" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Default values" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "if key in some_dict:", - " value = some_dict[key]", - "else:", - " value = default_value" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "value = some_dict.get(key, default_value)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "words = ['apple', 'bat', 'bar', 'atom', 'book']", - "by_letter = {}", - "for word in words:", - " letter = word[0]", - " if letter not in by_letter:", - " by_letter[letter] = [word]", - " else:", - " by_letter[letter].append(word)", - "by_letter" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "raw", - "metadata": {}, - "source": [ - "for word in words:", - " letter = word[0]", - " by_letter.setdefault(letter, []).append(word)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from collections import defaultdict", - "by_letter = defaultdict(list)", - "for word in words:", - " by_letter[word[0]].append(word)" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Valid dict key types" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "hash('string')", - "hash((1, 2, (2, 3)))", - "hash((1, 2, [2, 3])) # fails because lists are mutable" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "d = {}", - "d[tuple([1, 2, 3])] = 5", - "d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "set" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "set([2, 2, 2, 1, 3, 3])", - "{2, 2, 2, 1, 3, 3}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = {1, 2, 3, 4, 5}", - "b = {3, 4, 5, 6, 7, 8}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a.union(b)", - "a | b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a.intersection(b)", - "a & b" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "c = a.copy()", - "c |= b", - "c", - "d = a.copy()", - "d &= b", - "d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_data = [1, 2, 3, 4]", - "my_set = {tuple(my_data)}", - "my_set" - 
], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a_set = {1, 2, 3, 4, 5}", - "{1, 2, 3}.issubset(a_set)", - "a_set.issuperset({1, 2, 3})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "{1, 2, 3} == {3, 2, 1}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "List, Set, and Dict Comprehensions" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "[" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = []", - "for val in collection:", - " if " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "strings = ['a', 'as', 'bat', 'car', 'dove', 'python']", - "[x.upper() for x in strings if len(x) > 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "dict_comp = {" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "set_comp = {" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "unique_lengths = {len(x) for x in strings}", - "unique_lengths" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "set(map(len, strings))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "loc_mapping = {val : index for index, val in enumerate(strings)}", - "loc_mapping" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Nested list comprehensions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "all_data = [['John', 'Emily', 'Michael', 'Mary', 'Steven'],", - " ['Maria', 
'Juan', 'Javier', 'Natalia', 'Pilar']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "names_of_interest = []", - "for names in all_data:", - " enough_es = [name for name in names if name.count('e') >= 2]", - " names_of_interest.extend(enough_es)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = [name for names in all_data for name in names", - " if name.count('e') >= 2]", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "some_tuples = [(1, 2, 3), (4, 5, 6), (7, 8, 9)]", - "flattened = [x for tup in some_tuples for x in tup]", - "flattened" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "flattened = []", - "", - "for tup in some_tuples:", - " for x in tup:", - " flattened.append(x)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "[[x for x in tup] for tup in some_tuples]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Functions" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def my_function(x, y, z=1.5):", - " if z > 1:", - " return z * (x + y)", - " else:", - " return z / (x + y)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "my_function(5, 6, z=0.7)", - "my_function(3.14, 7, 3.5)", - "my_function(10, 20)" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Namespaces, Scope, and Local Functions" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def func():", - " a = []", - " for i in range(5):", - " a.append(i)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "a = []", - "def func():", - " for i in range(5):", - " a.append(i)" - ] - 
}, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = None", - "def bind_a_variable():", - " global a", - " a = []", - "bind_a_variable()", - "print(a)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Returning Multiple Values" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def f():", - " a = 5", - " b = 6", - " c = 7", - " return a, b, c", - "", - "a, b, c = f()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "return_value = f()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def f():", - " a = 5", - " b = 6", - " c = 7", - " return {'a' : a, 'b' : b, 'c' : c}" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Functions Are Objects" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = [' Alabama ', 'Georgia!', 'Georgia', 'georgia', 'FlOrIda',", - " 'south carolina##', 'West virginia?']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import re", - "", - "def clean_strings(strings):", - " result = []", - " for value in strings:", - " value = value.strip()", - " value = re.sub('[!#?]', '', value)", - " value = value.title()", - " result.append(value)", - " return result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "clean_strings(states)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def remove_punctuation(value):", - " return re.sub('[!#?]', '', value)", - "", - "clean_ops = [str.strip, remove_punctuation, str.title]", - "", - "def clean_strings(strings, ops):", - " result = []", - " for value in strings:", - " for function in ops:", - " value = function(value)", 
- " result.append(value)", - " return result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "clean_strings(states, clean_ops)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for x in map(remove_punctuation, states):", - " print(x)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Anonymous (Lambda) Functions" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def short_function(x):", - " return x * 2", - "", - "equiv_anon = lambda x: x * 2" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def apply_to_list(some_list, f):", - " return [f(x) for x in some_list]", - "", - "ints = [4, 0, 1, 5, 6]", - "apply_to_list(ints, lambda x: x * 2)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "strings = ['foo', 'card', 'bar', 'aaaa', 'abab']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "strings.sort(key=lambda x: len(set(list(x))))", - "strings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Currying: Partial Argument Application" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def add_numbers(x, y):", - " return x + y" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "add_five = lambda y: add_numbers(5, y)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from functools import partial", - "add_five = partial(add_numbers, 5)" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Generators" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"some_dict = {'a': 1, 'b': 2, 'c': 3}", - "for key in some_dict:", - " print(key)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dict_iterator = iter(some_dict)", - "dict_iterator" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "list(dict_iterator)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def squares(n=10):", - " print('Generating squares from 1 to {0}'.format(n ** 2))", - " for i in range(1, n + 1):", - " yield i ** 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "gen = squares()", - "gen" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for x in gen:", - " print(x, end=' ')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Generator expresssions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "gen = (x ** 2 for x in range(100))", - "gen" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def _make_gen():", - " for x in range(100):", - " yield x ** 2", - "gen = _make_gen()" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sum(x ** 2 for x in range(100))", - "dict((i, i **2) for i in range(5))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "itertools module" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import itertools", - "first_letter = lambda x: x[0]", - "names = ['Alan', 'Adam', 
'Wes', 'Will', 'Albert', 'Steven']", - "for letter, names in itertools.groupby(names, first_letter):", - " print(letter, list(names)) # names is a generator" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Errors and Exception Handling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "float('1.2345')", - "float('something')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def attempt_float(x):", - " try:", - " return float(x)", - " except:", - " return x" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "attempt_float('1.2345')", - "attempt_float('something')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "float((1, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def attempt_float(x):", - " try:", - " return float(x)", - " except ValueError:", - " return x" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "attempt_float((1, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def attempt_float(x):", - " try:", - " return float(x)", - " except (TypeError, ValueError):", - " return x" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "f = open(path, 'w')", - "", - "try:", - " write_to_file(f)", - "finally:", - " f.close()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "f = open(path, 'w')", - "", - "try:", - " write_to_file(f)", - "except:", - 
" print('Failed')", - "else:", - " print('Succeeded')", - "finally:", - " f.close()" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Exceptions in IPython" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [10]: %run examples/ipython_bug.py", - "---------------------------------------------------------------------------", - "AssertionError Traceback (most recent call last)", - "/home/wesm/code/pydata-book/examples/ipython_bug.py in ()", - " 13 throws_an_exception()", - " 14", - "---> 15 calling_things()", - "", - "/home/wesm/code/pydata-book/examples/ipython_bug.py in calling_things()", - " 11 def calling_things():", - " 12 works_fine()", - "---> 13 throws_an_exception()", - " 14", - " 15 calling_things()", - "", - "/home/wesm/code/pydata-book/examples/ipython_bug.py in throws_an_exception()", - " 7 a = 5", - " 8 b = 6", - "----> 9 assert(a + b == 10)", - " 10", - " 11 def calling_things():", - "", - "AssertionError:" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Files and the Operating System" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "path = 'examples/segismundo.txt'", - "f = open(path)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "for line in f:", - " pass" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "lines = [x.rstrip() for x in open(path)]", - "lines" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f.close()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "with open(path) as f:", - " lines = 
[x.rstrip() for x in f]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f = open(path)", - "f.read(10)", - "f2 = open(path, 'rb') # Binary mode", - "f2.read(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f.tell()", - "f2.tell()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import sys", - "sys.getdefaultencoding()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f.seek(3)", - "f.read(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f.close()", - "f2.close()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "with open('tmp.txt', 'w') as handle:", - " handle.writelines(x for x in open(path) if len(x) > 1)", - "with open('tmp.txt') as f:", - " lines = f.readlines()", - "lines" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import os", - "os.remove('tmp.txt')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Bytes and Unicode with Files" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "with open(path) as f:", - " chars = f.read(10)", - "chars" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "with open(path, 'rb') as f:", - " data = f.read(10)", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "data.decode('utf8')", - "data[:4].decode('utf8')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sink_path = 'sink.txt'", - "with open(path) as source:", - " with open(sink_path, 'xt', encoding='iso-8859-1') as sink:", - " sink.write(source.read())", - "with open(sink_path, encoding='iso-8859-1') as f:", - " print(f.read(10))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "os.remove(sink_path)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f = open(path)", - "f.read(5)", - "f.seek(4)", - "f.read(1)", - "f.close()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch04.ipynb b/ch04.ipynb index 49da25e8a..f1d142ce8 100644 --- a/ch04.ipynb +++ b/ch04.ipynb @@ -1,1480 +1,1856 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# NumPy Basics: Arrays and " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + 
"source": [ + "import numpy as np\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "my_arr = np.arange(1000000)\n", + "my_list = list(range(1000000))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%time for _ in range(10): my_arr2 = my_arr * 2\n", + "%time for _ in range(10): my_list2 = [x * 2 for x in my_list]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## The NumPy ndarray: A Multidimensional Array Object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "# Generate some random data\n", + "data = np.random.randn(2, 3)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data * 10\n", + "data + data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.shape\n", + "data.dtype" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Creating ndarrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data1 = [6, 7.5, 8, 0, 1]\n", + 
"arr1 = np.array(data1)\n", + "arr1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]\n", + "arr2 = np.array(data2)\n", + "arr2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2.ndim\n", + "arr2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr1.dtype\n", + "arr2.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.zeros(10)\n", + "np.zeros((3, 6))\n", + "np.empty((2, 3, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.arange(15)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Data Types for ndarrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr1 = np.array([1, 2, 3], dtype=np.float64)\n", + "arr2 = np.array([1, 2, 3], dtype=np.int32)\n", + "arr1.dtype\n", + "arr2.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([1, 2, 3, 4, 5])\n", + "arr.dtype\n", + "float_arr = arr.astype(np.float64)\n", + "float_arr.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])\n", + "arr\n", + "arr.astype(np.int32)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)\n", + "numeric_strings.astype(float)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "int_array = np.arange(10)\n", + "calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)\n", + "int_array.astype(calibers.dtype)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "empty_uint32 = np.empty(8, dtype='u4')\n", + "empty_uint32" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Arithmetic with NumPy Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([[1., 2., 3.], [4., 5., 6.]])\n", + "arr\n", + "arr * arr\n", + "arr - arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "1 / arr\n", + "arr ** 0.5" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])\n", + "arr2\n", + "arr2 > arr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"deletable": true, + "editable": true + }, + "source": [ + "### Basic Indexing and Slicing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "arr\n", + "arr[5]\n", + "arr[5:8]\n", + "arr[5:8] = 12\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr_slice = arr[5:8]\n", + "arr_slice" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr_slice[1] = 12345\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr_slice[:] = 64\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])\n", + "arr2d[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d[0][2]\n", + "arr2d[0, 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])\n", + "arr3d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr3d[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{ + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "old_values = arr3d[0].copy()\n", + "arr3d[0] = 42\n", + "arr3d\n", + "arr3d[0] = old_values\n", + "arr3d" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr3d[1, 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x = arr3d[1]\n", + "x\n", + "x[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Indexing with slices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr\n", + "arr[1:6]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d\n", + "arr2d[:2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d[:2, 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d[1, :2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d[:2, 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d[:, :1]" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr2d[:2, 1:] = 0\n", + "arr2d" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Boolean Indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])\n", + "data = np.random.randn(7, 4)\n", + "names\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names == 'Bob'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[names == 'Bob']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[names == 'Bob', 2:]\n", + "data[names == 'Bob', 3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names != 'Bob'\n", + "data[~(names == 'Bob')]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cond = names == 'Bob'\n", + "data[~cond]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mask = (names == 'Bob') | (names == 'Will')\n", + "mask\n", + "data[mask]" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[data < 0] = 0\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[names != 'Joe'] = 7\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Fancy Indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.empty((8, 4))\n", + "for i in range(8):\n", + " arr[i] = i\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr[[4, 3, 0, 6]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr[[-3, -5, -7]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(32).reshape((8, 4))\n", + "arr\n", + "arr[[1, 5, 7, 2], [0, 3, 1, 2]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Transposing Arrays and Swapping Axes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": 
[], + "source": [ + "arr = np.arange(15).reshape((3, 5))\n", + "arr\n", + "arr.T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(6, 3)\n", + "arr\n", + "np.dot(arr.T, arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(16).reshape((2, 2, 4))\n", + "arr\n", + "arr.transpose((1, 0, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr\n", + "arr.swapaxes(1, 2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Universal Functions: Fast Element-Wise Array Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "arr\n", + "np.sqrt(arr)\n", + "np.exp(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x = np.random.randn(8)\n", + "y = np.random.randn(8)\n", + "x\n", + "y\n", + "np.maximum(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(7) * 5\n", + "arr\n", + "remainder, whole_part = np.modf(arr)\n", + "remainder\n", + "whole_part" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr\n", + 
"np.sqrt(arr)\n", + "np.sqrt(arr, arr)\n", + "arr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Array-Oriented Programming with Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "points = np.arange(-5, 5, 0.01) # 1000 equally spaced points\n", + "xs, ys = np.meshgrid(points, points)\n", + "ys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "z = np.sqrt(xs ** 2 + ys ** 2)\n", + "z" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()\n", + "plt.title(\"Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.draw()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Expressing Conditional Logic as Array Operations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])\n", + "yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])\n", + "cond = np.array([True, False, True, True, False])" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = [(x if c else y)\n", + " for x, y, c in zip(xarr, yarr, cond)]\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = np.where(cond, xarr, yarr)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(4, 4)\n", + "arr\n", + "arr > 0\n", + "np.where(arr > 0, 2, -2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.where(arr > 0, 2, arr) # set only positive values to 2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Mathematical and Statistical Methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(5, 4)\n", + "arr\n", + "arr.mean()\n", + "np.mean(arr)\n", + "arr.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr.mean(axis=1)\n", + "arr.sum(axis=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])\n", + "arr.cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + 
"editable": true + }, + "outputs": [], + "source": [ + "arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])\n", + "arr\n", + "arr.cumsum(axis=0)\n", + "arr.cumprod(axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Methods for Boolean Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(100)\n", + "(arr > 0).sum() # Number of positive values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bools = np.array([False, False, True, False])\n", + "bools.any()\n", + "bools.all()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Sorting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(6)\n", + "arr\n", + "arr.sort()\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.random.randn(5, 3)\n", + "arr\n", + "arr.sort(1)\n", + "arr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "large_arr = np.random.randn(1000)\n", + "large_arr.sort()\n", + "large_arr[int(0.05 * len(large_arr))] # 5% quantile" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Unique and Other Set Logic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])\n", + "np.unique(names)\n", + "ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])\n", + "np.unique(ints)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sorted(set(names))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "values = np.array([6, 0, 0, 3, 2, 5, 6])\n", + "np.in1d(values, [2, 3, 6])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## File Input and Output with Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(10)\n", + "np.save('some_array', arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.load('some_array.npy')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.savez('array_archive.npz', a=arr, b=arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arch = np.load('array_archive.npz')\n", + "arch['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"np.savez_compressed('arrays_compressed.npz', a=arr, b=arr)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!rm some_array.npy\n", + "!rm array_archive.npz\n", + "!rm arrays_compressed.npz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Linear Algebra" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x = np.array([[1., 2., 3.], [4., 5., 6.]])\n", + "y = np.array([[6., 23.], [-1, 7], [8, 9]])\n", + "x\n", + "y\n", + "x.dot(y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.dot(x, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.dot(x, np.ones(3))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "x @ np.ones(3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from numpy.linalg import inv, qr\n", + "X = np.random.randn(5, 5)\n", + "mat = X.T.dot(X)\n", + "inv(mat)\n", + "mat.dot(inv(mat))\n", + "q, r = qr(mat)\n", + "r" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Pseudorandom Number Generation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + 
}, + "outputs": [], + "source": [ + "samples = np.random.normal(size=(4, 4))\n", + "samples" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from random import normalvariate\n", + "N = 1000000\n", + "%timeit samples = [normalvariate(0, 1) for _ in range(N)]\n", + "%timeit np.random.normal(size=N)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.random.seed(1234)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = np.random.RandomState(1234)\n", + "rng.randn(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Example: Random Walks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import random\n", + "position = 0\n", + "walk = [position]\n", + "steps = 1000\n", + "for i in range(steps):\n", + " step = 1 if random.randint(0, 1) else -1\n", + " position += step\n", + " walk.append(position)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.plot(walk[:100])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"np.random.seed(12345)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nsteps = 1000\n", + "draws = np.random.randint(0, 2, size=nsteps)\n", + "steps = np.where(draws > 0, 1, -1)\n", + "walk = steps.cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "walk.min()\n", + "walk.max()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "(np.abs(walk) >= 10).argmax()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Simulating Many Random Walks at Once" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nwalks = 5000\n", + "nsteps = 1000\n", + "draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1\n", + "steps = np.where(draws > 0, 1, -1)\n", + "walks = steps.cumsum(1)\n", + "walks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "walks.max()\n", + "walks.min()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "hits30 = (np.abs(walks) >= 30).any(1)\n", + "hits30\n", + "hits30.sum() # Number that hit 30 or -30" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "crossing_times = 
(np.abs(walks[hits30]) >= 30).argmax(1)\n", + "crossing_times.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "steps = np.random.normal(loc=0, scale=0.25,\n", + " size=(nwalks, nsteps))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch04" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "NumPy Basics: Arrays and " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np", - "my_arr = np.arange(1000000)", - "my_list = list(range(1000000))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%time for _ in range(10): my_arr2 = my_arr * 2", - "%time for _ in range(10): my_list2 = [x * 2 for x in my_list]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "The NumPy ndarray: A Multidimensional Array Object" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np", - "# Generate some random data", - "data = np.random.randn(2, 3)", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data * 10", - "data + data" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.shape", - "data.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Creating ndarrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data1 = [6, 7.5, 8, 0, 1]", - "arr1 = np.array(data1)", - "arr1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data2 = [[1, 2, 3, 4], [5, 6, 7, 8]]", - "arr2 = np.array(data2)", - "arr2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2.ndim", - "arr2.shape" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr1.dtype", - "arr2.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.zeros(10)", - "np.zeros((3, 6))", - "np.empty((2, 3, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.arange(15)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Data Types for ndarrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr1 = np.array([1, 2, 3], dtype=np.float64)", - "arr2 = np.array([1, 2, 3], dtype=np.int32)", - "arr1.dtype", - "arr2.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([1, 2, 3, 4, 5])", - "arr.dtype", - "float_arr = arr.astype(np.float64)", - "float_arr.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - 
}, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([3.7, -1.2, -2.6, 0.5, 12.9, 10.1])", - "arr", - "arr.astype(np.int32)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "numeric_strings = np.array(['1.25', '-9.6', '42'], dtype=np.string_)", - "numeric_strings.astype(float)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "int_array = np.arange(10)", - "calibers = np.array([.22, .270, .357, .380, .44, .50], dtype=np.float64)", - "int_array.astype(calibers.dtype)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "empty_uint32 = np.empty(8, dtype='u4')", - "empty_uint32" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Arithmetic with NumPy Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([[1., 2., 3.], [4., 5., 6.]])", - "arr", - "arr * arr", - "arr - arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "1 / arr", - "arr ** 0.5" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2 = np.array([[0., 4., 1.], [7., 2., 12.]])", - "arr2", - "arr2 > arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Basic Indexing and Slicing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)", - "arr", - "arr[5]", - "arr[5:8]", - "arr[5:8] = 12", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "arr_slice = arr[5:8]", - "arr_slice" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_slice[1] = 12345", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr_slice[:] = 64", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])", - "arr2d[2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[0][2]", - "arr2d[0, 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr3d = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])", - "arr3d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr3d[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "old_values = arr3d[0].copy()", - "arr3d[0] = 42", - "arr3d", - "arr3d[0] = old_values", - "arr3d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr3d[1, 0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = arr3d[1]", - "x", - "x[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Indexing with slices" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr", - "arr[1:6]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "arr2d", - "arr2d[:2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:2, 1:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[1, :2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:2, 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:, :1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr2d[:2, 1:] = 0", - "arr2d" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Boolean Indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])", - "data = np.random.randn(7, 4)", - "names", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names == 'Bob'" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[names == 'Bob']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[names == 'Bob', 2:]", - "data[names == 'Bob', 3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names != 'Bob'", - "data[~(names == 'Bob')]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"cond = names == 'Bob'", - "data[~cond]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mask = (names == 'Bob') | (names == 'Will')", - "mask", - "data[mask]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[data < 0] = 0", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[names != 'Joe'] = 7", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Fancy Indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.empty((8, 4))", - "for i in range(8):", - " arr[i] = i", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[[4, 3, 0, 6]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[[-3, -5, -7]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(32).reshape((8, 4))", - "arr", - "arr[[1, 5, 7, 2], [0, 3, 1, 2]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr[[1, 5, 7, 2]][:, [0, 3, 1, 2]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Transposing Arrays and Swapping Axes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(15).reshape((3, 5))", - "arr", - "arr.T" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "arr = np.random.randn(6, 3)", - "arr", - "np.dot(arr.T, arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(16).reshape((2, 2, 4))", - "arr", - "arr.transpose((1, 0, 2))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr", - "arr.swapaxes(1, 2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Universal Functions: Fast Element-Wise Array Functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)", - "arr", - "np.sqrt(arr)", - "np.exp(arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = np.random.randn(8)", - "y = np.random.randn(8)", - "x", - "y", - "np.maximum(x, y)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(7) * 5", - "arr", - "remainder, whole_part = np.modf(arr)", - "remainder", - "whole_part" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr", - "np.sqrt(arr)", - "np.sqrt(arr, arr)", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Array-Oriented Programming with Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "points = np.arange(-5, 5, 0.01) # 1000 equally spaced points", - "xs, ys = np.meshgrid(points, points)", - "ys" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "z = np.sqrt(xs ** 2 + ys 
** 2)", - "z" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt", - "plt.imshow(z, cmap=plt.cm.gray); plt.colorbar()", - "plt.title(\"Image plot of $\\sqrt{x^2 + y^2}$ for a grid of values\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.draw()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Expressing Conditional Logic as Array Operations" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])", - "yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])", - "cond = np.array([True, False, True, True, False])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = [(x if c else y)", - " for x, y, c in zip(xarr, yarr, cond)]", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = np.where(cond, xarr, yarr)", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(4, 4)", - "arr", - "arr > 0", - "np.where(arr > 0, 2, -2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.where(arr > 0, 2, arr) # set only positive values to 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Mathematical and 
Statistical Methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(5, 4)", - "arr", - "arr.mean()", - "np.mean(arr)", - "arr.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr.mean(axis=1)", - "arr.sum(axis=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([0, 1, 2, 3, 4, 5, 6, 7])", - "arr.cumsum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])", - "arr", - "arr.cumsum(axis=0)", - "arr.cumprod(axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Methods for Boolean Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(100)", - "(arr > 0).sum() # Number of positive values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bools = np.array([False, False, True, False])", - "bools.any()", - "bools.all()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Sorting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(6)", - "arr", - "arr.sort()", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.random.randn(5, 3)", - "arr", - "arr.sort(1)", - "arr" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "large_arr = np.random.randn(1000)", - 
"large_arr.sort()", - "large_arr[int(0.05 * len(large_arr))] # 5% quantile" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Unique and Other Set Logic" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])", - "np.unique(names)", - "ints = np.array([3, 3, 3, 2, 2, 1, 1, 4, 4])", - "np.unique(ints)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sorted(set(names))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = np.array([6, 0, 0, 3, 2, 5, 6])", - "np.in1d(values, [2, 3, 6])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "File Input and Output with Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(10)", - "np.save('some_array', arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.load('some_array.npy')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.savez('array_archive.npz', a=arr, b=arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arch = np.load('array_archive.npz')", - "arch['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.savez_compressed('arrays_compressed.npz', a=arr, b=arr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "!rm some_array.npy", - "!rm array_archive.npz", - "!rm arrays_compressed.npz" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Linear Algebra" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x = np.array([[1., 2., 3.], [4., 5., 6.]])", - "y = np.array([[6., 23.], [-1, 7], [8, 9]])", - "x", - "y", - "x.dot(y)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.dot(x, y)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.dot(x, np.ones(3))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "x @ np.ones(3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.linalg import inv, qr", - "X = np.random.randn(5, 5)", - "mat = X.T.dot(X)", - "inv(mat)", - "mat.dot(inv(mat))", - "q, r = qr(mat)", - "r" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Pseudorandom Number Generation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "samples = np.random.normal(size=(4, 4))", - "samples" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from random import normalvariate", - "N = 1000000", - "%timeit samples = [normalvariate(0, 1) for _ in range(N)]", - "%timeit np.random.normal(size=N)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(1234)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, 
- { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = np.random.RandomState(1234)", - "rng.randn(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Example: Random Walks" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import random", - "position = 0", - "walk = [position]", - "steps = 1000", - "for i in range(steps):", - " step = 1 if random.randint(0, 1) else -1", - " position += step", - " walk.append(position)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.plot(walk[:100])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nsteps = 1000", - "draws = np.random.randint(0, 2, size=nsteps)", - "steps = np.where(draws > 0, 1, -1)", - "walk = steps.cumsum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "walk.min()", - "walk.max()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "(np.abs(walk) >= 10).argmax()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Simulating Many Random Walks at Once" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nwalks = 5000", - "nsteps = 1000", - "draws = np.random.randint(0, 2, size=(nwalks, nsteps)) # 0 or 1", - "steps = np.where(draws 
> 0, 1, -1)", - "walks = steps.cumsum(1)", - "walks" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "walks.max()", - "walks.min()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "hits30 = (np.abs(walks) >= 30).any(1)", - "hits30", - "hits30.sum() # Number that hit 30 or -30" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "crossing_times = (np.abs(walks[hits30]) >= 30).argmax(1)", - "crossing_times.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "steps = np.random.normal(loc=0, scale=0.25,", - " size=(nwalks, nsteps))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch05.ipynb b/ch05.ipynb index fed4ecad0..559274bb9 100644 --- a/ch05.ipynb +++ b/ch05.ipynb @@ -1,1634 +1,2057 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Getting Started with pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from pandas import Series, DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Introduction to pandas Data Structures" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series([4, 7, -5, 3])\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj.values\n", + "obj.index # like range(4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])\n", + "obj2\n", + "obj2.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"obj2['a']\n", + "obj2['d'] = 6\n", + "obj2[['c', 'a', 'd']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj2[obj2 > 0]\n", + "obj2 * 2\n", + "np.exp(obj2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'b' in obj2\n", + "'e' in obj2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}\n", + "obj3 = pd.Series(sdata)\n", + "obj3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "states = ['California', 'Ohio', 'Oregon', 'Texas']\n", + "obj4 = pd.Series(sdata, index=states)\n", + "obj4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.isnull(obj4)\n", + "pd.notnull(obj4)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj4.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj3\n", + "obj4\n", + "obj3 + obj4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj4.name = 'population'\n", + "obj4.index.name = 'state'\n", + "obj4" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj\n", + "obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']\n", + "obj" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### DataFrame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],\n", + " 'year': [2000, 2001, 2002, 2001, 2002, 2003],\n", + " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}\n", + "frame = pd.DataFrame(data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.DataFrame(data, columns=['year', 'state', 'pop'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],\n", + " index=['one', 'two', 'three', 'four',\n", + " 'five', 'six'])\n", + "frame2\n", + "frame2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2['state']\n", + "frame2.year" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2.loc['three']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2['debt'] = 16.5\n", + "frame2\n", + "frame2['debt'] = np.arange(6.)\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])\n", + "frame2['debt'] = val\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2['eastern'] = frame2.state == 'Ohio'\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "del frame2['eastern']\n", + "frame2.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pop = {'Nevada': {2001: 2.4, 2002: 2.9},\n", + " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame3 = pd.DataFrame(pop)\n", + "frame3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame3.T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "pd.DataFrame(pop, index=[2001, 2002, 2003])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pdata = {'Ohio': frame3['Ohio'][:-1],\n", + " 'Nevada': frame3['Nevada'][:2]}\n", + "pd.DataFrame(pdata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame3.index.name = 'year'; frame3.columns.name = 'state'\n", + "frame3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame3.values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2.values" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Index Objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series(range(3), index=['a', 'b', 'c'])\n", + "index = obj.index\n", + "index\n", + "index[1:]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "index[1] = 'd' # TypeError" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "labels = pd.Index(np.arange(3))\n", + "labels\n", + "obj2 = pd.Series([1.5, -2.5, 0], index=labels)\n", + "obj2\n", + "obj2.index is labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame3\n", + "frame3.columns\n", + "'Ohio' in frame3.columns\n", + "2003 in frame3.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])\n", + "dup_labels" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Essential Functionality" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reindexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])\n", + "obj2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])\n", + "obj3\n", + "obj3.reindex(range(6), method='ffill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.arange(9).reshape((3, 3)),\n", + " index=['a', 'c', 'd'],\n", + " columns=['Ohio', 'Texas', 'California'])\n", + "frame\n", + "frame2 = frame.reindex(['a', 'b', 'c', 'd'])\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "states = ['Texas', 'Utah', 'California']\n", + "frame.reindex(columns=states)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.loc[['a', 'b', 'c', 'd'], states]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Dropping Entries from an Axis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])\n", + "obj\n", + "new_obj = obj.drop('c')\n", + "new_obj\n", + "obj.drop(['d', 'c'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n", + " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", + " columns=['one', 'two', 'three', 'four'])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.drop(['Colorado', 'Ohio'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.drop('two', axis=1)\n", + "data.drop(['two', 'four'], axis='columns')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj.drop('c', inplace=True)\n", + "obj" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, 
+ "source": [ + "### Indexing, Selection, and Filtering" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])\n", + "obj\n", + "obj['b']\n", + "obj[1]\n", + "obj[2:4]\n", + "obj[['b', 'a', 'd']]\n", + "obj[[1, 3]]\n", + "obj[obj < 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj['b':'c']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj['b':'c'] = 5\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(np.arange(16).reshape((4, 4)),\n", + " index=['Ohio', 'Colorado', 'Utah', 'New York'],\n", + " columns=['one', 'two', 'three', 'four'])\n", + "data\n", + "data['two']\n", + "data[['three', 'one']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[:2]\n", + "data[data['three'] > 5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data < 5\n", + "data[data < 5] = 0\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Selection with loc and iloc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ 
+ "data.loc['Colorado', ['two', 'three']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.iloc[2, [3, 0, 1]]\n", + "data.iloc[2]\n", + "data.iloc[[1, 2], [3, 0, 1]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.loc[:'Utah', 'two']\n", + "data.iloc[:, :3][data.three > 5]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Integer Indexes" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ser = pd.Series(np.arange(3.))\n", + "ser\n", + "ser[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ser = pd.Series(np.arange(3.))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ser" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])\n", + "ser2[-1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ser[:1]\n", + "ser.loc[:1]\n", + "ser.iloc[:1]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Arithmetic and Data Alignment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])\n", + "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],\n", + " index=['a', 'c', 'e', 'f', 'g'])\n", + "s1\n", + "s2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s1 + s2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),\n", + " index=['Ohio', 'Texas', 'Colorado'])\n", + "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),\n", + " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", + "df1\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 + df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame({'A': [1, 2]})\n", + "df2 = pd.DataFrame({'B': [3, 4]})\n", + "df1\n", + "df2\n", + "df1 - df2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Arithmetic methods with fill values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),\n", + " columns=list('abcd'))\n", + "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),\n", + " columns=list('abcde'))\n", + "df2.loc[1, 'b'] = np.nan\n", + "df1\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 + df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1.add(df2, fill_value=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "1 / df1\n", + "df1.rdiv(1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1.reindex(columns=df2.columns, fill_value=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Operations between DataFrame and Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(12.).reshape((3, 4))\n", + "arr\n", + "arr[0]\n", + "arr - arr[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),\n", + " columns=list('bde'),\n", + " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", + "series = frame.iloc[0]\n", + "frame\n", + "series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame - series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "series2 = pd.Series(range(3), index=['b', 'e', 
'f'])\n", + "frame + series2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "series3 = frame['d']\n", + "frame\n", + "series3\n", + "frame.sub(series3, axis='index')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Function Application and Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),\n", + " index=['Utah', 'Ohio', 'Texas', 'Oregon'])\n", + "frame\n", + "np.abs(frame)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "f = lambda x: x.max() - x.min()\n", + "frame.apply(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.apply(f, axis='columns')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def f(x):\n", + " return pd.Series([x.min(), x.max()], index=['min', 'max'])\n", + "frame.apply(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "format = lambda x: '%.2f' % x\n", + "frame.applymap(format)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame['e'].map(format)" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Sorting and Ranking" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])\n", + "obj.sort_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.arange(8).reshape((2, 4)),\n", + " index=['three', 'one'],\n", + " columns=['d', 'a', 'b', 'c'])\n", + "frame.sort_index()\n", + "frame.sort_index(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.sort_index(axis=1, ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series([4, 7, -3, 2])\n", + "obj.sort_values()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])\n", + "obj.sort_values()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})\n", + "frame\n", + "frame.sort_values(by='b')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.sort_values(by=['a', 'b'])" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])\n", + "obj.rank()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj.rank(method='first')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Assign tie values the maximum rank in the group\n", + "obj.rank(ascending=False, method='max')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],\n", + " 'c': [-2, 5, 8, -2.5]})\n", + "frame\n", + "frame.rank(axis='columns')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Axis Indexes with Duplicate Labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])\n", + "obj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj['a']\n", + "obj['c']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = 
pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])\n", + "df\n", + "df.loc['b']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Summarizing and Computing Descriptive Statistics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],\n", + " [np.nan, np.nan], [0.75, -1.3]],\n", + " index=['a', 'b', 'c', 'd'],\n", + " columns=['one', 'two'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.sum(axis='columns')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.mean(axis='columns', skipna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.idxmax()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.cumsum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = 
pd.Series(['a', 'a', 'b', 'c'] * 4)\n", + "obj.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Correlation and Covariance" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "conda install pandas-datareader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "price = pd.read_pickle('examples/yahoo_price.pkl')\n", + "volume = pd.read_pickle('examples/yahoo_volume.pkl')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "import pandas_datareader.data as web\n", + "all_data = {ticker: web.get_data_yahoo(ticker)\n", + " for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}\n", + "\n", + "price = pd.DataFrame({ticker: data['Adj Close']\n", + " for ticker, data in all_data.items()})\n", + "volume = pd.DataFrame({ticker: data['Volume']\n", + " for ticker, data in all_data.items()})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "returns = price.pct_change()\n", + "returns.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "returns['MSFT'].corr(returns['IBM'])\n", + "returns['MSFT'].cov(returns['IBM'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "returns.MSFT.corr(returns.IBM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ 
+ "returns.corr()\n", + "returns.cov()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "returns.corrwith(returns.IBM)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "returns.corrwith(volume)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Unique Values, Value Counts, and Membership" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "uniques = obj.unique()\n", + "uniques" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.value_counts(obj.values, sort=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj\n", + "mask = obj.isin(['b', 'c'])\n", + "mask\n", + "obj[mask]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])\n", + 
"unique_vals = pd.Series(['c', 'b', 'a'])\n", + "pd.Index(unique_vals).get_indexer(to_match)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],\n", + " 'Qu2': [2, 3, 1, 2, 3],\n", + " 'Qu3': [1, 5, 2, 4, 4]})\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = data.apply(pd.value_counts).fillna(0)\n", + "result" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + } + ], "metadata": { - "name": "generated_ch05" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Getting Started with pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas as pd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas import Series, DataFrame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] 
- }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Introduction to pandas Data Structures" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series([4, 7, -5, 3])", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.values", - "obj.index # like range(4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])", - "obj2", - "obj2.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2['a']", - "obj2['d'] = 6", - "obj2[['c', 'a', 'd']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2[obj2 > 0]", - "obj2 * 2", - "np.exp(obj2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'b' in obj2", - "'e' in obj2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}", - "obj3 = pd.Series(sdata)", - "obj3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = ['California', 'Ohio', 'Oregon', 'Texas']", - "obj4 = pd.Series(sdata, index=states)", - "obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.isnull(obj4)", - "pd.notnull(obj4)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "obj4.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj3", - "obj4", - "obj3 + obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj4.name = 'population'", - "obj4.index.name = 'state'", - "obj4" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj", - "obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "DataFrame" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],", - " 'year': [2000, 2001, 2002, 2001, 2002, 2003],", - " 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}", - "frame = pd.DataFrame(data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.DataFrame(data, columns=['year', 'state', 'pop'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],", - " index=['one', 'two', 'three', 'four',", - " 'five', 'six'])", - "frame2", - "frame2.columns" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2['state']", 
- "frame2.year" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.loc['three']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2['debt'] = 16.5", - "frame2", - "frame2['debt'] = np.arange(6.)", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])", - "frame2['debt'] = val", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2['eastern'] = frame2.state == 'Ohio'", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "del frame2['eastern']", - "frame2.columns" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pop = {'Nevada': {2001: 2.4, 2002: 2.9},", - " 'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3 = pd.DataFrame(pop)", - "frame3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3.T" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.DataFrame(pop, index=[2001, 2002, 2003])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pdata = {'Ohio': frame3['Ohio'][:-1],", - " 'Nevada': frame3['Nevada'][:2]}", - "pd.DataFrame(pdata)" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3.index.name = 'year'; frame3.columns.name = 'state'", - "frame3" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3.values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2.values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Index Objects" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(range(3), index=['a', 'b', 'c'])", - "index = obj.index", - "index", - "index[1:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "index[1] = 'd' # TypeError" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "labels = pd.Index(np.arange(3))", - "labels", - "obj2 = pd.Series([1.5, -2.5, 0], index=labels)", - "obj2", - "obj2.index is labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame3", - "frame3.columns", - "'Ohio' in frame3.columns", - "2003 in frame3.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])", - "dup_labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Essential Functionality" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reindexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])", - "obj" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])", - "obj2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])", - "obj3", - "obj3.reindex(range(6), method='ffill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.arange(9).reshape((3, 3)),", - " index=['a', 'c', 'd'],", - " columns=['Ohio', 'Texas', 'California'])", - "frame", - "frame2 = frame.reindex(['a', 'b', 'c', 'd'])", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = ['Texas', 'Utah', 'California']", - "frame.reindex(columns=states)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.loc[['a', 'b', 'c', 'd'], states]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Dropping Entries from an Axis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])", - "obj", - "new_obj = obj.drop('c')", - "new_obj", - "obj.drop(['d', 'c'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame(np.arange(16).reshape((4, 4)),", - " index=['Ohio', 'Colorado', 'Utah', 'New York'],", - " columns=['one', 'two', 'three', 'four'])", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop(['Colorado', 
'Ohio'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop('two', axis=1)", - "data.drop(['two', 'four'], axis='columns')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.drop('c', inplace=True)", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Indexing, Selection, and Filtering" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])", - "obj", - "obj['b']", - "obj[1]", - "obj[2:4]", - "obj[['b', 'a', 'd']]", - "obj[[1, 3]]", - "obj[obj < 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj['b':'c']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj['b':'c'] = 5", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame(np.arange(16).reshape((4, 4)),", - " index=['Ohio', 'Colorado', 'Utah', 'New York'],", - " columns=['one', 'two', 'three', 'four'])", - "data", - "data['two']", - "data[['three', 'one']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[:2]", - "data[data['three'] > 5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data < 5", - "data[data < 5] = 0", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Selection with loc and iloc" - ] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "data.loc['Colorado', ['two', 'three']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.iloc[2, [3, 0, 1]]", - "data.iloc[2]", - "data.iloc[[1, 2], [3, 0, 1]]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.loc[:'Utah', 'two']", - "data.iloc[:, :3][data.three > 5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Integer Indexes" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ser = pd.Series(np.arange(3.))", - "ser", - "ser[-1]" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser = pd.Series(np.arange(3.))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser2 = pd.Series(np.arange(3.), index=['a', 'b', 'c'])", - "ser2[-1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ser[:1]", - "ser.loc[:1]", - "ser.iloc[:1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Arithmetic and Data Alignment" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])", - "s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],", - " index=['a', 'c', 'e', 'f', 'g'])", - "s1", - "s2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 + s2" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),", - " index=['Ohio', 'Texas', 'Colorado'])", - "df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),", - " index=['Utah', 'Ohio', 'Texas', 'Oregon'])", - "df1", - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 + df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame({'A': [1, 2]})", - "df2 = pd.DataFrame({'B': [3, 4]})", - "df1", - "df2", - "df1 - df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Arithmetic methods with fill values" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),", - " columns=list('abcd'))", - "df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),", - " columns=list('abcde'))", - "df2.loc[1, 'b'] = np.nan", - "df1", - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 + df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1.add(df2, fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "1 / df1", - "df1.rdiv(1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1.reindex(columns=df2.columns, fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - 
"metadata": {}, - "source": [ - "Operations between DataFrame and Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(12.).reshape((3, 4))", - "arr", - "arr[0]", - "arr - arr[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),", - " columns=list('bde'),", - " index=['Utah', 'Ohio', 'Texas', 'Oregon'])", - "series = frame.iloc[0]", - "frame", - "series" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame - series" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "series2 = pd.Series(range(3), index=['b', 'e', 'f'])", - "frame + series2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "series3 = frame['d']", - "frame", - "series3", - "frame.sub(series3, axis='index')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Function Application and Mapping" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),", - " index=['Utah', 'Ohio', 'Texas', 'Oregon'])", - "frame", - "np.abs(frame)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "f = lambda x: x.max() - x.min()", - "frame.apply(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.apply(f, axis='columns')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"def f(x):", - " return pd.Series([x.min(), x.max()], index=['min', 'max'])", - "frame.apply(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "format = lambda x: '%.2f' % x", - "frame.applymap(format)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['e'].map(format)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Sorting and Ranking" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(range(4), index=['d', 'a', 'b', 'c'])", - "obj.sort_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.arange(8).reshape((2, 4)),", - " index=['three', 'one'],", - " columns=['d', 'a', 'b', 'c'])", - "frame.sort_index()", - "frame.sort_index(axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sort_index(axis=1, ascending=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series([4, 7, -3, 2])", - "obj.sort_values()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])", - "obj.sort_values()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})", - "frame", - "frame.sort_values(by='b')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "frame.sort_values(by=['a', 'b'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series([7, -5, 7, 4, 2, 0, 4])", - "obj.rank()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.rank(method='first')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Assign tie values the maximum rank in the group", - "obj.rank(ascending=False, method='max')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame({'b': [4.3, 7, -3, 2], 'a': [0, 1, 0, 1],", - " 'c': [-2, 5, 8, -2.5]})", - "frame", - "frame.rank(axis='columns')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Axis Indexes with Duplicate Labels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])", - "obj" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.index.is_unique" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj['a']", - "obj['c']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])", - "df", - "df.loc['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Summarizing and Computing Descriptive Statistics" - ] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "df = pd.DataFrame([[1.4, np.nan], [7.1, -4.5],", - " [np.nan, np.nan], [0.75, -1.3]],", - " index=['a', 'b', 'c', 'd'],", - " columns=['one', 'two'])", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.sum(axis='columns')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.mean(axis='columns', skipna=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.idxmax()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.cumsum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(['a', 'a', 'b', 'c'] * 4)", - "obj.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Correlation and Covariance" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "conda install pandas-datareader" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "price = pd.read_pickle('examples/yahoo_price.pkl')", - "volume = pd.read_pickle('examples/yahoo_volume.pkl')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "import pandas_datareader.data as web", - "all_data = {ticker: web.get_data_yahoo(ticker)", - " for ticker 
in ['AAPL', 'IBM', 'MSFT', 'GOOG']}", - "", - "price = pd.DataFrame({ticker: data['Adj Close']", - " for ticker, data in all_data.items()})", - "volume = pd.DataFrame({ticker: data['Volume']", - " for ticker, data in all_data.items()})" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns = price.pct_change()", - "returns.tail()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns['MSFT'].corr(returns['IBM'])", - "returns['MSFT'].cov(returns['IBM'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.MSFT.corr(returns.IBM)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.corr()", - "returns.cov()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.corrwith(returns.IBM)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "returns.corrwith(volume)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Unique Values, Value Counts, and Membership" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = pd.Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "uniques = obj.unique()", - "uniques" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj.value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, 
- "input": [ - "pd.value_counts(obj.values, sort=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj", - "mask = obj.isin(['b', 'c'])", - "mask", - "obj[mask]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])", - "unique_vals = pd.Series(['c', 'b', 'a'])", - "pd.Index(unique_vals).get_indexer(to_match)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame({'Qu1': [1, 3, 4, 3, 4],", - " 'Qu2': [2, 3, 1, 2, 3],", - " 'Qu3': [1, 5, 2, 4, 4]})", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = data.apply(pd.value_counts).fillna(0)", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch06.ipynb b/ch06.ipynb index 0b0bc195a..5f95fe92d 100644 --- 
a/ch06.ipynb +++ b/ch06.ipynb @@ -1,1020 +1,1276 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Data Loading, Storage, and File Formats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Reading and Writing Data in Text Format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!cat examples/ex1.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.read_csv('examples/ex1.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.read_table('examples/ex1.csv', sep=',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!cat examples/ex2.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.read_csv('examples/ex2.csv', header=None)\n", + "pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { +
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names = ['a', 'b', 'c', 'd', 'message']\n", + "pd.read_csv('examples/ex2.csv', names=names, index_col='message')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!cat examples/csv_mindex.csv\n", + "parsed = pd.read_csv('examples/csv_mindex.csv',\n", + " index_col=['key1', 'key2'])\n", + "parsed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "list(open('examples/ex3.txt'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = pd.read_table('examples/ex3.txt', sep='\\s+')\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!cat examples/ex4.csv\n", + "pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!cat examples/ex5.csv\n", + "result = pd.read_csv('examples/ex5.csv')\n", + "result\n", + "pd.isnull(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sentinels = 
{'message': ['foo', 'NA'], 'something': ['two']}\n", + "pd.read_csv('examples/ex5.csv', na_values=sentinels)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reading Text Files in Pieces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = pd.read_csv('examples/ex6.csv')\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.read_csv('examples/ex6.csv', nrows=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)\n", + "chunker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)\n", + "\n", + "tot = pd.Series([])\n", + "for piece in chunker:\n", + " tot = tot.add(piece['key'].value_counts(), fill_value=0)\n", + "\n", + "tot = tot.sort_values(ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tot[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Writing Data to Text Format" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.read_csv('examples/ex5.csv')\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.to_csv('examples/out.csv')\n", + "!cat examples/out.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import sys\n", + "data.to_csv(sys.stdout, sep='|')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, na_rep='NULL')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, index=False, header=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dates = pd.date_range('1/1/2000', periods=7)\n", + "ts = pd.Series(np.arange(7), index=dates)\n", + "ts.to_csv('examples/tseries.csv')\n", + "!cat examples/tseries.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Working with Delimited Formats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + 
"editable": true + }, + "outputs": [], + "source": [ + "!cat examples/ex7.csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import csv\n", + "f = open('examples/ex7.csv')\n", + "\n", + "reader = csv.reader(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for line in reader:\n", + " print(line)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "with open('examples/ex7.csv') as f:\n", + " lines = list(csv.reader(f))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "header, values = lines[0], lines[1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data_dict = {h: v for h, v in zip(header, zip(*values))}\n", + "data_dict" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "class my_dialect(csv.Dialect):\n", + " lineterminator = '\\n'\n", + " delimiter = ';'\n", + " quotechar = '\"'\n", + " quoting = csv.QUOTE_MINIMAL" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "reader = csv.reader(f, dialect=my_dialect)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "reader = csv.reader(f, delimiter='|')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "with open('mydata.csv', 
'w') as f:\n", + " writer = csv.writer(f, dialect=my_dialect)\n", + " writer.writerow(('one', 'two', 'three'))\n", + " writer.writerow(('1', '2', '3'))\n", + " writer.writerow(('4', '5', '6'))\n", + " writer.writerow(('7', '8', '9'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### JSON Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "obj = \"\"\"\n", + "{\"name\": \"Wes\",\n", + " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],\n", + " \"pet\": null,\n", + " \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},\n", + " {\"name\": \"Katie\", \"age\": 38,\n", + " \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]\n", + "}\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import json\n", + "result = json.loads(obj)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "asjson = json.dumps(result)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])\n", + "siblings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!cat examples/example.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"data = pd.read_json('examples/example.json')\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(data.to_json())\n", + "print(data.to_json(orient='records'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### XML and HTML: Web Scraping" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "conda install lxml\n", + "pip install beautifulsoup4 html5lib" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tables = pd.read_html('examples/fdic_failed_bank_list.html')\n", + "len(tables)\n", + "failures = tables[0]\n", + "failures.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "close_timestamps = pd.to_datetime(failures['Closing Date'])\n", + "close_timestamps.dt.year.value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Parsing XML with lxml.objectify" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "\n", + " 373889\n", + " \n", + " Metro-North Railroad\n", + " Escalator Availability\n", + " Percent of the time that escalators are operational\n", + " systemwide. The availability rate is based on physical observations performed\n", + " the morning of regular business days only. 
This is a new indicator the agency\n", + " began reporting in 2009.\n", + " 2011\n", + " 12\n", + " Service Indicators\n", + " M\n", + " U\n", + " %\n", + " 1\n", + " 97.00\n", + " \n", + " 97.00\n", + " \n", + "" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from lxml import objectify\n", + "\n", + "path = 'examples/mta_perf/Performance_MNR.xml'\n", + "parsed = objectify.parse(open(path))\n", + "root = parsed.getroot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = []\n", + "\n", + "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',\n", + " 'DESIRED_CHANGE', 'DECIMAL_PLACES']\n", + "\n", + "for elt in root.INDICATOR:\n", + " el_data = {}\n", + " for child in elt.getchildren():\n", + " if child.tag in skip_fields:\n", + " continue\n", + " el_data[child.tag] = child.pyval\n", + " data.append(el_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "perf = pd.DataFrame(data)\n", + "perf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "tag = 'Google'\n", + "root = objectify.parse(StringIO(tag)).getroot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "root\n", + "root.get('href')\n", + "root.text" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Binary Data Formats" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.read_csv('examples/ex1.csv')\n", + "frame\n", + "frame.to_pickle('examples/frame_pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.read_pickle('examples/frame_pickle')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!rm examples/frame_pickle" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Using HDF5 Format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame({'a': np.random.randn(100)})\n", + "store = pd.HDFStore('mydata.h5')\n", + "store['obj1'] = frame\n", + "store['obj1_col'] = frame['a']\n", + "store" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "store['obj1']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "store.put('obj2', frame, format='table')\n", + "store.select('obj2', where=['index >= 10 and index <= 15'])\n", + "store.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.to_hdf('mydata.h5', 'obj3', format='table')\n", + "pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "os.remove('mydata.h5')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reading Microsoft Excel Files" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "xlsx = pd.ExcelFile('examples/ex1.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.read_excel(xlsx, 'Sheet1')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "writer = pd.ExcelWriter('examples/ex2.xlsx')\n", + "frame.to_excel(writer, 'Sheet1')\n", + "writer.save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.to_excel('examples/ex2.xlsx')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!rm examples/ex2.xlsx" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Interacting with Web APIs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, 
+ "outputs": [], + "source": [ + "import requests\n", + "url = 'https://api.github.com/repos/pandas-dev/pandas/issues'\n", + "resp = requests.get(url)\n", + "resp" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = resp.json()\n", + "data[0]['title']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "issues = pd.DataFrame(data, columns=['number', 'title',\n", + " 'labels', 'state'])\n", + "issues" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Interacting with Databases" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import sqlite3\n", + "query = \"\"\"\n", + "CREATE TABLE test\n", + "(a VARCHAR(20), b VARCHAR(20),\n", + " c REAL, d INTEGER\n", + ");\"\"\"\n", + "con = sqlite3.connect('mydata.sqlite')\n", + "con.execute(query)\n", + "con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = [('Atlanta', 'Georgia', 1.25, 6),\n", + " ('Tallahassee', 'Florida', 2.6, 3),\n", + " ('Sacramento', 'California', 1.7, 5)]\n", + "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"\n", + "con.executemany(stmt, data)\n", + "con.commit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cursor = con.execute('select * from test')\n", + "rows = cursor.fetchall()\n", + "rows" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cursor.description\n", + "pd.DataFrame(rows, columns=[x[0] for x in cursor.description])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import sqlalchemy as sqla\n", + "db = sqla.create_engine('sqlite:///mydata.sqlite')\n", + "pd.read_sql('select * from test', db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "!rm mydata.sqlite" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch06" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data Loading, Storage, " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Reading and Writing Data in Text Format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/ex1.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.read_csv('examples/ex1.csv')", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"pd.read_table('examples/ex1.csv', sep=',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/ex2.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_csv('examples/ex2.csv', header=None)", - "pd.read_csv('examples/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names = ['a', 'b', 'c', 'd', 'message']", - "pd.read_csv('examples/ex2.csv', names=names, index_col='message')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/csv_mindex.csv", - "parsed = pd.read_csv('examples/csv_mindex.csv',", - " index_col=['key1', 'key2'])", - "parsed" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "list(open('examples/ex3.txt'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.read_table('examples/ex3.txt', sep='\\s+')", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/ex4.csv", - "pd.read_csv('examples/ex4.csv', skiprows=[0, 2, 3])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/ex5.csv", - "result = pd.read_csv('examples/ex5.csv')", - "result", - "pd.isnull(result)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.read_csv('examples/ex5.csv', na_values=['NULL'])", - "result" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sentinels = {'message': ['foo', 'NA'], 'something': ['two']}", - "pd.read_csv('examples/ex5.csv', na_values=sentinels)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reading Text Files in Pieces" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = 10" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.read_csv('examples/ex6.csv')", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_csv('examples/ex6.csv', nrows=5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)", - "chunker" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "chunker = pd.read_csv('examples/ex6.csv', chunksize=1000)", - "", - "tot = pd.Series([])", - "for piece in chunker:", - " tot = tot.add(piece['key'].value_counts(), fill_value=0)", - "", - "tot = tot.sort_values(ascending=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tot[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Writing Data to Text Format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('examples/ex5.csv')", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", 
- "collapsed": false, - "input": [ - "data.to_csv('examples/out.csv')", - "!cat examples/out.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import sys", - "data.to_csv(sys.stdout, sep='|')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, na_rep='NULL')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, index=False, header=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.to_csv(sys.stdout, index=False, columns=['a', 'b', 'c'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dates = pd.date_range('1/1/2000', periods=7)", - "ts = pd.Series(np.arange(7), index=dates)", - "ts.to_csv('examples/tseries.csv')", - "!cat examples/tseries.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Working with Delimited Formats" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/ex7.csv" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import csv", - "f = open('examples/ex7.csv')", - "", - "reader = csv.reader(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for line in reader:", - " print(line)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "with open('examples/ex7.csv') as f:", - " lines = list(csv.reader(f))" - 
], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "header, values = lines[0], lines[1:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data_dict = {h: v for h, v in zip(header, zip(*values))}", - "data_dict" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "class my_dialect(csv.Dialect):", - " lineterminator = '\\n'", - " delimiter = ';'", - " quotechar = '\"'", - " quoting = csv.QUOTE_MINIMAL" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "reader = csv.reader(f, dialect=my_dialect)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "reader = csv.reader(f, delimiter='|')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "with open('mydata.csv', 'w') as f:", - " writer = csv.writer(f, dialect=my_dialect)", - " writer.writerow(('one', 'two', 'three'))", - " writer.writerow(('1', '2', '3'))", - " writer.writerow(('4', '5', '6'))", - " writer.writerow(('7', '8', '9'))" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "JSON Data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "obj = \"\"\"", - "{\"name\": \"Wes\",", - " \"places_lived\": [\"United States\", \"Spain\", \"Germany\"],", - " \"pet\": null,", - " \"siblings\": [{\"name\": \"Scott\", \"age\": 30, \"pets\": [\"Zeus\", \"Zuko\"]},", - " {\"name\": \"Katie\", \"age\": 38,", - " \"pets\": [\"Sixes\", \"Stache\", \"Cisco\"]}]", - "}", - "\"\"\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import json", - "result = json.loads(obj)", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "asjson = json.dumps(result)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "siblings = pd.DataFrame(result['siblings'], columns=['name', 'age'])", - "siblings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!cat examples/example.json" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_json('examples/example.json')", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(data.to_json())", - "print(data.to_json(orient='records'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "XML and HTML: Web Scraping" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "conda install lxml", - "pip install beautifulsoup4 html5lib" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tables = pd.read_html('examples/fdic_failed_bank_list.html')", - "len(tables)", - "failures = tables[0]", - "failures.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_timestamps = pd.to_datetime(failures['Closing Date'])", - "close_timestamps.dt.year.value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Parsing XML with lxml.objectify" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "", - " 373889", - " ", - " Metro-North Railroad", - " Escalator Availability", - " Percent of the time that escalators are operational", - " systemwide. 
The availability rate is based on physical observations performed", - " the morning of regular business days only. This is a new indicator the agency", - " began reporting in 2009.", - " 2011", - " 12", - " Service Indicators", - " M", - " U", - " %", - " 1", - " 97.00", - " ", - " 97.00", - " ", - "" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from lxml import objectify", - "", - "path = 'examples/mta_perf/Performance_MNR.xml'", - "parsed = objectify.parse(open(path))", - "root = parsed.getroot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = []", - "", - "skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ',", - " 'DESIRED_CHANGE', 'DECIMAL_PLACES']", - "", - "for elt in root.INDICATOR:", - " el_data = {}", - " for child in elt.getchildren():", - " if child.tag in skip_fields:", - " continue", - " el_data[child.tag] = child.pyval", - " data.append(el_data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "perf = pd.DataFrame(data)", - "perf.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from io import StringIO", - "tag = 'Google'", - "root = objectify.parse(StringIO(tag)).getroot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "root", - "root.get('href')", - "root.text" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Binary Data Formats" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.read_csv('examples/ex1.csv')", - "frame", - "frame.to_pickle('examples/frame_pickle')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "pd.read_pickle('examples/frame_pickle')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!rm examples/frame_pickle" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Using HDF5 Format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame({'a': np.random.randn(100)})", - "store = pd.HDFStore('mydata.h5')", - "store['obj1'] = frame", - "store['obj1_col'] = frame['a']", - "store" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "store['obj1']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "store.put('obj2', frame, format='table')", - "store.select('obj2', where=['index >= 10 and index <= 15'])", - "store.close()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.to_hdf('mydata.h5', 'obj3', format='table')", - "pd.read_hdf('mydata.h5', 'obj3', where=['index < 5'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "os.remove('mydata.h5')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reading Microsoft Excel Files" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "xlsx = pd.ExcelFile('examples/ex1.xlsx')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.read_excel(xlsx, 'Sheet1')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.read_excel('examples/ex1.xlsx', 'Sheet1')", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "writer = pd.ExcelWriter('examples/ex2.xlsx')", - "frame.to_excel(writer, 'Sheet1')", - "writer.save()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.to_excel('examples/ex2.xlsx')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!rm examples/ex2.xlsx" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Interacting with Web APIs" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import requests", - "url = 'https://api.github.com/repos/pandas-dev/pandas/issues'", - "resp = requests.get(url)", - "resp" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = resp.json()", - "data[0]['title']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "issues = pd.DataFrame(data, columns=['number', 'title',", - " 'labels', 'state'])", - "issues" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Interacting with Databases" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import sqlite3", - "query = \"\"\"", - "CREATE TABLE test", - "(a VARCHAR(20), b VARCHAR(20),", - " c REAL, d INTEGER", - ");\"\"\"", - "con = sqlite3.connect('mydata.sqlite')", - "con.execute(query)", - "con.commit()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "data = [('Atlanta', 'Georgia', 1.25, 6),", - " ('Tallahassee', 'Florida', 2.6, 3),", - " ('Sacramento', 'California', 1.7, 5)]", - "stmt = \"INSERT INTO test VALUES(?, ?, ?, ?)\"", - "con.executemany(stmt, data)", - "con.commit()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cursor = con.execute('select * from test')", - "rows = cursor.fetchall()", - "rows" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cursor.description", - "pd.DataFrame(rows, columns=[x[0] for x in cursor.description])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import sqlalchemy as sqla", - "db = sqla.create_engine('sqlite:///mydata.sqlite')", - "pd.read_sql('select * from test', db)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!rm mydata.sqlite" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch07.ipynb b/ch07.ipynb index df4208afb..571a14001 100644 --- a/ch07.ipynb +++ 
b/ch07.ipynb @@ -1,1168 +1,1468 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Data Cleaning and Preparation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Handling Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])\n", + "string_data\n", + "string_data.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "string_data[0] = None\n", + "string_data.isnull()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Filtering Out Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from numpy import nan as NA\n", + "data = pd.Series([1, NA, 3.5, NA, 7])\n", + "data.dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[data.notnull()]" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],\n", + " [NA, NA, NA], [NA, 6.5, 3.]])\n", + "cleaned = data.dropna()\n", + "data\n", + "cleaned" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.dropna(how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[4] = NA\n", + "data\n", + "data.dropna(axis=1, how='all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.random.randn(7, 3))\n", + "df.iloc[:4, 1] = NA\n", + "df.iloc[:2, 2] = NA\n", + "df\n", + "df.dropna()\n", + "df.dropna(thresh=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Filling In Missing Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.fillna({1: 0.5, 2: 0})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "_ = df.fillna(0, inplace=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": 
true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.random.randn(6, 3))\n", + "df.iloc[2:, 1] = NA\n", + "df.iloc[4:, 2] = NA\n", + "df\n", + "df.fillna(method='ffill')\n", + "df.fillna(method='ffill', limit=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.Series([1., NA, 3.5, NA, 7])\n", + "data.fillna(data.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Data Transformation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Removing Duplicates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],\n", + " 'k2': [1, 1, 2, 3, 3, 4, 4]})\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.duplicated()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data['v1'] = range(7)\n", + "data.drop_duplicates(['k1'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.drop_duplicates(['k1', 'k2'], keep='last')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + 
"editable": true + }, + "source": [ + "### Transforming Data Using a Function or Mapping" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',\n", + " 'Pastrami', 'corned beef', 'Bacon',\n", + " 'pastrami', 'honey ham', 'nova lox'],\n", + " 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "meat_to_animal = {\n", + " 'bacon': 'pig',\n", + " 'pulled pork': 'pig',\n", + " 'pastrami': 'cow',\n", + " 'corned beef': 'cow',\n", + " 'honey ham': 'pig',\n", + " 'nova lox': 'salmon'\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "lowercased = data['food'].str.lower()\n", + "lowercased\n", + "data['animal'] = lowercased.map(meat_to_animal)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data['food'].map(lambda x: meat_to_animal[x.lower()])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Replacing Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.Series([1., -999., 2., -999., -1000., 3.])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.replace(-999, np.nan)" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.replace([-999, -1000], np.nan)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.replace([-999, -1000], [np.nan, 0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.replace({-999: np.nan, -1000: 0})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Renaming Axis Indexes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(np.arange(12).reshape((3, 4)),\n", + " index=['Ohio', 'Colorado', 'New York'],\n", + " columns=['one', 'two', 'three', 'four'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "transform = lambda x: x[:4].upper()\n", + "data.index.map(transform)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.index = data.index.map(transform)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.rename(index=str.title, columns=str.upper)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "data.rename(index={'OHIO': 'INDIANA'},\n", + " columns={'three': 'peekaboo'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.rename(index={'OHIO': 'INDIANA'}, inplace=True)\n", + "data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Discretization and Binning" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bins = [18, 25, 35, 60, 100]\n", + "cats = pd.cut(ages, bins)\n", + "cats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cats.codes\n", + "cats.categories\n", + "pd.value_counts(cats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.cut(ages, [18, 26, 36, 61, 100], right=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']\n", + "pd.cut(ages, bins, labels=group_names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = np.random.rand(20)\n", + "pd.cut(data, 4, 
precision=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = np.random.randn(1000) # Normally distributed\n", + "cats = pd.qcut(data, 4) # Cut into quartiles\n", + "cats\n", + "pd.value_counts(cats)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Detecting and Filtering Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(np.random.randn(1000, 4))\n", + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "col = data[2]\n", + "col[np.abs(col) > 3]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[(np.abs(data) > 3).any(1)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[np.abs(data) > 3] = np.sign(data) * 3\n", + "data.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.sign(data).head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Permutation and Random 
Sampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))\n", + "sampler = np.random.permutation(5)\n", + "sampler" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df\n", + "df.take(sampler)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.sample(n=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "choices = pd.Series([5, 7, -1, 6, 4])\n", + "draws = choices.sample(n=10, replace=True)\n", + "draws" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Computing Indicator/Dummy Variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", + " 'data1': range(6)})\n", + "pd.get_dummies(df['key'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dummies = pd.get_dummies(df['key'], prefix='key')\n", + "df_with_dummy = df[['data1']].join(dummies)\n", + "df_with_dummy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mnames = ['movie_id', 'title', 'genres']\n", + "movies = 
pd.read_table('datasets/movielens/movies.dat', sep='::',\n", + " header=None, names=mnames)\n", + "movies[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "all_genres = []\n", + "for x in movies.genres:\n", + " all_genres.extend(x.split('|'))\n", + "genres = pd.unique(all_genres)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "genres" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "zero_matrix = np.zeros((len(movies), len(genres)))\n", + "dummies = pd.DataFrame(zero_matrix, columns=genres)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "gen = movies.genres[0]\n", + "gen.split('|')\n", + "dummies.columns.get_indexer(gen.split('|'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for i, gen in enumerate(movies.genres):\n", + " indices = dummies.columns.get_indexer(gen.split('|'))\n", + " dummies.iloc[i, indices] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "movies_windic = movies.join(dummies.add_prefix('Genre_'))\n", + "movies_windic.iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.random.seed(12345)\n", + "values = 
np.random.rand(10)\n", + "values\n", + "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]\n", + "pd.get_dummies(pd.cut(values, bins))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## String Manipulation" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### String Object Methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val = 'a,b, guido'\n", + "val.split(',')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pieces = [x.strip() for x in val.split(',')]\n", + "pieces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "first, second, third = pieces\n", + "first + '::' + second + '::' + third" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'::'.join(pieces)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "'guido' in val\n", + "val.index(',')\n", + "val.find(':')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val.index(':')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val.count(',')" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "val.replace(',', '::')\n", + "val.replace(',', '')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Regular Expressions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import re\n", + "text = \"foo bar\\t baz \\tqux\"\n", + "re.split('\\s+', text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "regex = re.compile('\\s+')\n", + "regex.split(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "regex.findall(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "text = \"\"\"Dave dave@google.com\n", + "Steve steve@gmail.com\n", + "Rob rob@gmail.com\n", + "Ryan ryan@yahoo.com\n", + "\"\"\"\n", + "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'\n", + "\n", + "# re.IGNORECASE makes the regex case-insensitive\n", + "regex = re.compile(pattern, flags=re.IGNORECASE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "regex.findall(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "m = regex.search(text)\n", + "m\n", + "text[m.start():m.end()]" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(regex.match(text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(regex.sub('REDACTED', text))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'\n", + "regex = re.compile(pattern, flags=re.IGNORECASE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "m = regex.match('wesm@bright.net')\n", + "m.groups()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "regex.findall(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(regex.sub(r'Username: \\1, Domain: \\2, Suffix: \\3', text))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Vectorized String Functions in pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',\n", + " 'Rob': 'rob@gmail.com', 'Wes': np.nan}\n", + "data = pd.Series(data)\n", + "data\n", + "data.isnull()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.str.contains('gmail')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pattern\n", + "data.str.findall(pattern, flags=re.IGNORECASE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "matches = data.str.match(pattern, flags=re.IGNORECASE)\n", + "matches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "matches.str.get(1)\n", + "matches.str[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.str[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch07" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data Cleaning and Preparation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4, 
suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Handling Missing Data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])", - "string_data", - "string_data.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "string_data[0] = None", - "string_data.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Filtering Out Missing Data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy import nan as NA", - "data = pd.Series([1, NA, 3.5, NA, 7])", - "data.dropna()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[data.notnull()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],", - " [NA, NA, NA], [NA, 6.5, 3.]])", - "cleaned = data.dropna()", - "data", - "cleaned" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.dropna(how='all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[4] = NA", - "data", - "data.dropna(axis=1, how='all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame(np.random.randn(7, 3))", - "df.iloc[:4, 1] = NA", - "df.iloc[:2, 2] = NA", - "df", - "df.dropna()", - "df.dropna(thresh=2)" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Filling In Missing Data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.fillna(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.fillna({1: 0.5, 2: 0})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "_ = df.fillna(0, inplace=True)", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame(np.random.randn(6, 3))", - "df.iloc[2:, 1] = NA", - "df.iloc[4:, 2] = NA", - "df", - "df.fillna(method='ffill')", - "df.fillna(method='ffill', limit=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.Series([1., NA, 3.5, NA, 7])", - "data.fillna(data.mean())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data Transformation" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Removing Duplicates" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],", - " 'k2': [1, 1, 2, 3, 3, 4, 4]})", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.duplicated()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop_duplicates()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['v1'] = range(7)", - 
"data.drop_duplicates(['k1'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.drop_duplicates(['k1', 'k2'], keep='last')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Transforming Data Using a Function or Mapping" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon',", - " 'Pastrami', 'corned beef', 'Bacon',", - " 'pastrami', 'honey ham', 'nova lox'],", - " 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "meat_to_animal = {", - " 'bacon': 'pig',", - " 'pulled pork': 'pig',", - " 'pastrami': 'cow',", - " 'corned beef': 'cow',", - " 'honey ham': 'pig',", - " 'nova lox': 'salmon'", - "}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "lowercased = data['food'].str.lower()", - "lowercased", - "data['animal'] = lowercased.map(meat_to_animal)", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['food'].map(lambda x: meat_to_animal[x.lower()])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Replacing Values" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.Series([1., -999., 2., -999., -1000., 3.])", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace(-999, np.nan)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "data.replace([-999, -1000], np.nan)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace([-999, -1000], [np.nan, 0])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.replace({-999: np.nan, -1000: 0})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Renaming Axis Indexes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame(np.arange(12).reshape((3, 4)),", - " index=['Ohio', 'Colorado', 'New York'],", - " columns=['one', 'two', 'three', 'four'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "transform = lambda x: x[:4].upper()", - "data.index.map(transform)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.index = data.index.map(transform)", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.rename(index=str.title, columns=str.upper)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.rename(index={'OHIO': 'INDIANA'},", - " columns={'three': 'peekaboo'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.rename(index={'OHIO': 'INDIANA'}, inplace=True)", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Discretization and Binning" - ] - }, - { - "cell_type": 
"code", - "collapsed": false, - "input": [ - "ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = [18, 25, 35, 60, 100]", - "cats = pd.cut(ages, bins)", - "cats" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cats.codes", - "cats.categories", - "pd.value_counts(cats)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.cut(ages, [18, 26, 36, 61, 100], right=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']", - "pd.cut(ages, bins, labels=group_names)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = np.random.rand(20)", - "pd.cut(data, 4, precision=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = np.random.randn(1000) # Normally distributed", - "cats = pd.qcut(data, 4) # Cut into quartiles", - "cats", - "pd.value_counts(cats)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Detecting and Filtering Outliers" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame(np.random.randn(1000, 4))", - "data.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": 
false, - "input": [ - "col = data[2]", - "col[np.abs(col) > 3]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[(np.abs(data) > 3).any(1)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[np.abs(data) > 3] = np.sign(data) * 3", - "data.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.sign(data).head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Permutation and Random Sampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))", - "sampler = np.random.permutation(5)", - "sampler" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df", - "df.take(sampler)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.sample(n=3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "choices = pd.Series([5, 7, -1, 6, 4])", - "draws = choices.sample(n=10, replace=True)", - "draws" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Computing Indicator/Dummy Variables" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],", - " 'data1': range(6)})", - "pd.get_dummies(df['key'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dummies = 
pd.get_dummies(df['key'], prefix='key')", - "df_with_dummy = df[['data1']].join(dummies)", - "df_with_dummy" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mnames = ['movie_id', 'title', 'genres']", - "movies = pd.read_table('datasets/movielens/movies.dat', sep='::',", - " header=None, names=mnames)", - "movies[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "all_genres = []", - "for x in movies.genres:", - " all_genres.extend(x.split('|'))", - "genres = pd.unique(all_genres)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "genres" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "zero_matrix = np.zeros((len(movies), len(genres)))", - "dummies = pd.DataFrame(zero_matrix, columns=genres)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "gen = movies.genres[0]", - "gen.split('|')", - "dummies.columns.get_indexer(gen.split('|'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for i, gen in enumerate(movies.genres):", - " indices = dummies.columns.get_indexer(gen.split('|'))", - " dummies.iloc[i, indices] = 1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "movies_windic = movies.join(dummies.add_prefix('Genre_'))", - "movies_windic.iloc[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)", - "values = np.random.rand(10)", - "values", - "bins = [0, 0.2, 0.4, 0.6, 0.8, 1]", - 
"pd.get_dummies(pd.cut(values, bins))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "String Manipulation" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "String Object Methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val = 'a,b, guido'", - "val.split(',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pieces = [x.strip() for x in val.split(',')]", - "pieces" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "first, second, third = pieces", - "first + '::' + second + '::' + third" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'::'.join(pieces)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "'guido' in val", - "val.index(',')", - "val.find(':')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.index(':')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.count(',')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "val.replace(',', '::')", - "val.replace(',', '')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Regular Expressions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import re", - "text = \"foo bar\\t baz \\tqux\"", - "re.split('\\s+', text)" - ], - "language": "python", - 
"metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex = re.compile('\\s+')", - "regex.split(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex.findall(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "text = \"\"\"Dave dave@google.com", - "Steve steve@gmail.com", - "Rob rob@gmail.com", - "Ryan ryan@yahoo.com", - "\"\"\"", - "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'", - "", - "# re.IGNORECASE makes the regex case-insensitive", - "regex = re.compile(pattern, flags=re.IGNORECASE)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex.findall(text)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m = regex.search(text)", - "m", - "text[m.start():m.end()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(regex.match(text))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(regex.sub('REDACTED', text))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})'", - "regex = re.compile(pattern, flags=re.IGNORECASE)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "m = regex.match('wesm@bright.net')", - "m.groups()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "regex.findall(text)" - ], 
- "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(regex.sub(r'Username: \\1, Domain: \\2, Suffix: \\3', text))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Vectorized String Functions in pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = {'Dave': 'dave@google.com', 'Steve': 'steve@gmail.com',", - " 'Rob': 'rob@gmail.com', 'Wes': np.nan}", - "data = pd.Series(data)", - "data", - "data.isnull()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.str.contains('gmail')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pattern", - "data.str.findall(pattern, flags=re.IGNORECASE)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "matches = data.str.match(pattern, flags=re.IGNORECASE)", - "matches" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "matches.str.get(1)", - "matches.str[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.str[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - 
"metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch08.ipynb b/ch08.ipynb index 3dfb10b45..e85389665 100644 --- a/ch08.ipynb +++ b/ch08.ipynb @@ -1,996 +1,1239 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Data Wrangling: Join, Combine, and Reshape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "pd.options.display.max_rows = 20\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Hierarchical Indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.Series(np.random.randn(9),\n", + " index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],\n", + " [1, 2, 3, 1, 3, 1, 2, 2, 3]])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], +
"source": [ + "data['b']\n", + "data['b':'c']\n", + "data.loc[['b', 'd']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.loc[:, 2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data.unstack().stack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.arange(12).reshape((4, 3)),\n", + " index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],\n", + " columns=[['Ohio', 'Ohio', 'Colorado'],\n", + " ['Green', 'Red', 'Green']])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.index.names = ['key1', 'key2']\n", + "frame.columns.names = ['state', 'color']\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame['Ohio']" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],\n", + " names=['state', 'color'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reordering and Sorting Levels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.swaplevel('key1', 'key2')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.sort_index(level=1)\n", + "frame.swaplevel(0, 1).sort_index(level=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Summary Statistics by Level" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.sum(level='key2')\n", + "frame.sum(level='color', axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Indexing with a DataFrame's columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),\n", + " 'c': ['one', 'one', 'one', 'two', 'two',\n", + " 'two', 'two'],\n", + " 'd': [0, 1, 2, 0, 1, 2, 3]})\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2 = frame.set_index(['c', 'd'])\n", + "frame2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.set_index(['c', 'd'], drop=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame2.reset_index()" + ] + }, + { + "cell_type": "markdown", 
+ "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Combining and Merging Datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Database-Style DataFrame Joins" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", + " 'data1': range(7)})\n", + "df2 = pd.DataFrame({'key': ['a', 'b', 'd'],\n", + " 'data2': range(3)})\n", + "df1\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, on='key')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],\n", + " 'data1': range(7)})\n", + "df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],\n", + " 'data2': range(3)})\n", + "pd.merge(df3, df4, left_on='lkey', right_on='rkey')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],\n", + " 'data1': range(6)})\n", + "df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 
'd'],\n", + " 'data2': range(5)})\n", + "df1\n", + "df2\n", + "pd.merge(df1, df2, on='key', how='left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(df1, df2, how='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],\n", + " 'key2': ['one', 'two', 'one'],\n", + " 'lval': [1, 2, 3]})\n", + "right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],\n", + " 'key2': ['one', 'one', 'one', 'two'],\n", + " 'rval': [4, 5, 6, 7]})\n", + "pd.merge(left, right, on=['key1', 'key2'], how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(left, right, on='key1')\n", + "pd.merge(left, right, on='key1', suffixes=('_left', '_right'))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Merging on Index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],\n", + " 'value': range(6)})\n", + "right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])\n", + "left1\n", + "right1\n", + "pd.merge(left1, right1, left_on='key', right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(left1, right1, left_on='key', right_index=True, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',\n", + " 'Nevada', 'Nevada'],\n", + " 'key2': [2000, 2001, 2002, 2001, 2002],\n", + " 'data': np.arange(5.)})\n", + "righth = pd.DataFrame(np.arange(12).reshape((6, 2)),\n", + " index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',\n", + " 'Ohio', 'Ohio'],\n", + " [2001, 2000, 2000, 2000, 2001, 2002]],\n", + " columns=['event1', 'event2'])\n", + "lefth\n", + "righth" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)\n", + "pd.merge(lefth, righth, left_on=['key1', 'key2'],\n", + " right_index=True, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],\n", + " index=['a', 'c', 'e'],\n", + " columns=['Ohio', 'Nevada'])\n", + "right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],\n", + " index=['b', 'c', 'd', 'e'],\n", + " columns=['Missouri', 'Alabama'])\n", + "left2\n", + "right2\n", + "pd.merge(left2, right2, how='outer', left_index=True, right_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "left2.join(right2, how='outer')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "left1.join(right1, on='key')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + 
}, + "outputs": [], + "source": [ + "another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],\n", + " index=['a', 'c', 'e', 'f'],\n", + " columns=['New York', 'Oregon'])\n", + "another\n", + "left2.join([right2, another])\n", + "left2.join([right2, another], how='outer')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Concatenating Along an Axis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "arr = np.arange(12).reshape((3, 4))\n", + "arr\n", + "np.concatenate([arr, arr], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s1 = pd.Series([0, 1], index=['a', 'b'])\n", + "s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])\n", + "s3 = pd.Series([5, 6], index=['f', 'g'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.concat([s1, s2, s3])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.concat([s1, s2, s3], axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s4 = pd.concat([s1, s3])\n", + "s4\n", + "pd.concat([s1, s4], axis=1)\n", + "pd.concat([s1, s4], axis=1, join='inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])\n", + "result\n", + "result.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],\n", + " columns=['one', 'two'])\n", + "df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],\n", + " columns=['three', 'four'])\n", + "df1\n", + "df2\n", + "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.concat({'level1': df1, 'level2': df2}, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],\n", + " names=['upper', 'lower'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])\n", + "df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])\n", + "df1\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "pd.concat([df1, df2], ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Combining Data with Overlap" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],\n", + " index=['f', 'e', 'd', 'c', 'b', 'a'])\n", + "b = pd.Series(np.arange(len(a), dtype=np.float64),\n", + " index=['f', 'e', 'd', 'c', 'b', 'a'])\n", + "b[-1] = np.nan\n", + "a\n", + "b\n", + "np.where(pd.isnull(a), b, a)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "b[:-2].combine_first(a[2:])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],\n", + " 'b': [np.nan, 2., np.nan, 6.],\n", + " 'c': range(2, 18, 4)})\n", + "df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],\n", + " 'b': [np.nan, 3., 4., 6., 8.]})\n", + "df1\n", + "df2\n", + "df1.combine_first(df2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Reshaping and Pivoting" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Reshaping with Hierarchical Indexing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(np.arange(6).reshape((2, 3)),\n", + " index=pd.Index(['Ohio', 'Colorado'], name='state'),\n", + " columns=pd.Index(['one', 'two', 'three'],\n", + 
" name='number'))\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = data.stack()\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result.unstack(0)\n", + "result.unstack('state')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])\n", + "s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])\n", + "data2 = pd.concat([s1, s2], keys=['one', 'two'])\n", + "data2\n", + "data2.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data2.unstack()\n", + "data2.unstack().stack()\n", + "data2.unstack().stack(dropna=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame({'left': result, 'right': result + 5},\n", + " columns=pd.Index(['left', 'right'], name='side'))\n", + "df\n", + "df.unstack('state')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.unstack('state').stack('side')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + 
"### Pivoting “Long” to “Wide” Format" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.read_csv('examples/macrodata.csv')\n", + "data.head()\n", + "periods = pd.PeriodIndex(year=data.year, quarter=data.quarter,\n", + " name='date')\n", + "columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')\n", + "data = data.reindex(columns=columns)\n", + "data.index = periods.to_timestamp('D', 'end')\n", + "ldata = data.stack().reset_index().rename(columns={0: 'value'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ldata[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pivoted = ldata.pivot('date', 'item', 'value')\n", + "pivoted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ldata['value2'] = np.random.randn(len(ldata))\n", + "ldata[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pivoted = ldata.pivot('date', 'item')\n", + "pivoted[:5]\n", + "pivoted['value'][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "unstacked = ldata.set_index(['date', 'item']).unstack('item')\n", + "unstacked[:7]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Pivoting “Wide” to “Long” Format" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],\n", + " 'A': [1, 2, 3],\n", + " 'B': [4, 5, 6],\n", + " 'C': [7, 8, 9]})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "melted = pd.melt(df, ['key'])\n", + "melted" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "reshaped = melted.pivot('key', 'variable', 'value')\n", + "reshaped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "reshaped.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.melt(df, value_vars=['A', 'B', 'C'])\n", + "pd.melt(df, value_vars=['key', 'A', 'B'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch08" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data Wrangling: Join, Combine, " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as 
pd", - "pd.options.display.max_rows = 20", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Hierarchical Indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.Series(np.random.randn(9),", - " index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],", - " [1, 2, 3, 1, 3, 1, 2, 2, 3]])", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['b']", - "data['b':'c']", - "data.loc[['b', 'd']]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.loc[:, 2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data.unstack().stack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.arange(12).reshape((4, 3)),", - " index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],", - " columns=[['Ohio', 'Ohio', 'Colorado'],", - " ['Green', 'Red', 'Green']])", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.index.names = ['key1', 'key2']", - "frame.columns.names = ['state', 'color']", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - 
{ - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['Ohio']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],", - " names=['state', 'color'])" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reordering and Sorting Levels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.swaplevel('key1', 'key2')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sort_index(level=1)", - "frame.swaplevel(0, 1).sort_index(level=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Summary Statistics by Level" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.sum(level='key2')", - "frame.sum(level='color', axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Indexing with a DataFrame's columns" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),", - " 'c': ['one', 'one', 'one', 'two', 'two',", - " 'two', 'two'],", - " 'd': [0, 1, 2, 0, 1, 2, 3]})", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame2 = frame.set_index(['c', 'd'])", - "frame2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.set_index(['c', 'd'], drop=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"frame2.reset_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Combining and Merging Datasets" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Database-Style DataFrame Joins" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],", - " 'data1': range(7)})", - "df2 = pd.DataFrame({'key': ['a', 'b', 'd'],", - " 'data2': range(3)})", - "df1", - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, on='key')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],", - " 'data1': range(7)})", - "df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],", - " 'data2': range(3)})", - "pd.merge(df3, df4, left_on='lkey', right_on='rkey')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],", - " 'data1': range(6)})", - "df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],", - " 'data2': range(5)})", - "df1", - "df2", - "pd.merge(df1, df2, on='key', how='left')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(df1, df2, how='inner')" - ], - "language": "python", - "metadata": 
{}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],", - " 'key2': ['one', 'two', 'one'],", - " 'lval': [1, 2, 3]})", - "right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],", - " 'key2': ['one', 'one', 'one', 'two'],", - " 'rval': [4, 5, 6, 7]})", - "pd.merge(left, right, on=['key1', 'key2'], how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(left, right, on='key1')", - "pd.merge(left, right, on='key1', suffixes=('_left', '_right'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Merging on Index" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],", - " 'value': range(6)})", - "right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])", - "left1", - "right1", - "pd.merge(left1, right1, left_on='key', right_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.merge(left1, right1, left_on='key', right_index=True, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio',", - " 'Nevada', 'Nevada'],", - " 'key2': [2000, 2001, 2002, 2001, 2002],", - " 'data': np.arange(5.)})", - "righth = pd.DataFrame(np.arange(12).reshape((6, 2)),", - " index=[['Nevada', 'Nevada', 'Ohio', 'Ohio',", - " 'Ohio', 'Ohio'],", - " [2001, 2000, 2000, 2000, 2001, 2002]],", - " columns=['event1', 'event2'])", - "lefth", - "righth" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)", - "pd.merge(lefth, righth, left_on=['key1', 'key2'],", - " right_index=True, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],", - " index=['a', 'c', 'e'],", - " columns=['Ohio', 'Nevada'])", - "right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],", - " index=['b', 'c', 'd', 'e'],", - " columns=['Missouri', 'Alabama'])", - "left2", - "right2", - "pd.merge(left2, right2, how='outer', left_index=True, right_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left2.join(right2, how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "left1.join(right1, on='key')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],", - " index=['a', 'c', 'e', 'f'],", - " columns=['New York', 'Oregon'])", - "another", - "left2.join([right2, another])", - "left2.join([right2, another], how='outer')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Concatenating Along an Axis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "arr = np.arange(12).reshape((3, 4))", - "arr", - "np.concatenate([arr, arr], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = pd.Series([0, 1], index=['a', 'b'])", - "s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])", - "s3 = pd.Series([5, 6], index=['f', 'g'])" - ], - "language": "python", - 
"metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s2, s3])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s2, s3], axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s4 = pd.concat([s1, s3])", - "s4", - "pd.concat([s1, s4], axis=1)", - "pd.concat([s1, s4], axis=1, join='inner')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b', 'e']])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])", - "result", - "result.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],", - " columns=['one', 'two'])", - "df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'],", - " columns=['three', 'four'])", - "df1", - "df2", - "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat({'level1': df1, 'level2': df2}, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],", - " names=['upper', 'lower'])" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])", - "df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])", - "df1", - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.concat([df1, df2], ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Combining Data with Overlap" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],", - " index=['f', 'e', 'd', 'c', 'b', 'a'])", - "b = pd.Series(np.arange(len(a), dtype=np.float64),", - " index=['f', 'e', 'd', 'c', 'b', 'a'])", - "b[-1] = np.nan", - "a", - "b", - "np.where(pd.isnull(a), b, a)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "b[:-2].combine_first(a[2:])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],", - " 'b': [np.nan, 2., np.nan, 6.],", - " 'c': range(2, 18, 4)})", - "df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],", - " 'b': [np.nan, 3., 4., 6., 8.]})", - "df1", - "df2", - "df1.combine_first(df2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Reshaping and Pivoting" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Reshaping with Hierarchical Indexing" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame(np.arange(6).reshape((2, 3)),", - " index=pd.Index(['Ohio', 'Colorado'], 
name='state'),", - " columns=pd.Index(['one', 'two', 'three'],", - " name='number'))", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = data.stack()", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result.unstack(0)", - "result.unstack('state')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])", - "s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])", - "data2 = pd.concat([s1, s2], keys=['one', 'two'])", - "data2", - "data2.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data2.unstack()", - "data2.unstack().stack()", - "data2.unstack().stack(dropna=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame({'left': result, 'right': result + 5},", - " columns=pd.Index(['left', 'right'], name='side'))", - "df", - "df.unstack('state')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.unstack('state').stack('side')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Pivoting \u201cLong\u201d to \u201cWide\u201d Format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('examples/macrodata.csv')", - "data.head()", - "periods = pd.PeriodIndex(year=data.year, 
quarter=data.quarter,", - " name='date')", - "columns = pd.Index(['realgdp', 'infl', 'unemp'], name='item')", - "data = data.reindex(columns=columns)", - "data.index = periods.to_timestamp('D', 'end')", - "ldata = data.stack().reset_index().rename(columns={0: 'value'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ldata[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pivoted = ldata.pivot('date', 'item', 'value')", - "pivoted" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ldata['value2'] = np.random.randn(len(ldata))", - "ldata[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pivoted = ldata.pivot('date', 'item')", - "pivoted[:5]", - "pivoted['value'][:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "unstacked = ldata.set_index(['date', 'item']).unstack('item')", - "unstacked[:7]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Pivoting \u201cWide\u201d to \u201cLong\u201d Format" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],", - " 'A': [1, 2, 3],", - " 'B': [4, 5, 6],", - " 'C': [7, 8, 9]})", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "melted = pd.melt(df, ['key'])", - "melted" - ], - "language": "python", - "metadata": {}, 
- "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "reshaped = melted.pivot('key', 'variable', 'value')", - "reshaped" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "reshaped.reset_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.melt(df, value_vars=['A', 'B', 'C'])", - "pd.melt(df, value_vars=['key', 'A', 'B'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch09.ipynb b/ch09.ipynb index c65a3c755..77abd662c 100644 --- a/ch09.ipynb +++ b/ch09.ipynb @@ -1,948 +1,1193 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Plotting and Visualization" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot 
as plt\n", + "import matplotlib\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "%matplotlib notebook" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## A Brief matplotlib API Primer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "data = np.arange(10)\n", + "data\n", + "plt.plot(data)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Figures and Subplots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ax1 = fig.add_subplot(2, 2, 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ax2 = fig.add_subplot(2, 2, 2)\n", + "ax3 = fig.add_subplot(2, 2, 3)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "fig = plt.figure()\n", + "ax1 = fig.add_subplot(2, 2, 1)\n", + "ax2 = fig.add_subplot(2, 2, 2)\n", + "ax3 = fig.add_subplot(2, 2, 3)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.plot(np.random.randn(50).cumsum(), 'k--')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "_ = ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)\n", + "ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 3)\n", + "axes" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Adjusting the spacing around subplots" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "subplots_adjust(left=None, bottom=None, right=None, top=None,\n", + " wspace=None, hspace=None)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", + "for i in range(2):\n", + " for j in range(2):\n", + " axes[i, j].hist(np.random.randn(500), bins=50, color='k', alpha=0.5)\n", + "plt.subplots_adjust(wspace=0, hspace=0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)\n", + "for i in range(2):\n", + " for j in range(2):\n", + " axes[i, j].hist(np.random.randn(500), bins=50, 
color='k', alpha=0.5)\n", + "plt.subplots_adjust(wspace=0, hspace=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Colors, Markers, and Line Styles" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ax.plot(x, y, 'g--')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ax.plot(x, y, linestyle='--', color='g')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from numpy.random import randn\n", + "plt.plot(randn(30).cumsum(), 'ko--')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "plot(randn(30).cumsum(), color='k', linestyle='dashed', marker='o')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = np.random.randn(30).cumsum()\n", + "plt.plot(data, 'k--', label='Default')\n", + "plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')\n", + "plt.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Ticks, Labels, and Legends" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Setting the 
title, axis labels, ticks, and ticklabels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure()\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "ax.plot(np.random.randn(1000).cumsum())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ticks = ax.set_xticks([0, 250, 500, 750, 1000])\n", + "labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],\n", + " rotation=30, fontsize='small')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ax.set_title('My first matplotlib plot')\n", + "ax.set_xlabel('Stages')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "props = {\n", + " 'title': 'My first matplotlib plot',\n", + " 'xlabel': 'Stages'\n", + "}\n", + "ax.set(**props)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Adding legends" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from numpy.random import randn\n", + "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)\n", + "ax.plot(randn(1000).cumsum(), 'k', label='one')\n", + "ax.plot(randn(1000).cumsum(), 'k--', label='two')\n", + "ax.plot(randn(1000).cumsum(), 'k.', label='three')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ax.legend(loc='best')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"deletable": true, + "editable": true + }, + "source": [ + "### Annotations and Drawing on a Subplot" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ax.text(x, y, 'Hello world!',\n", + " family='monospace', fontsize=10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "\n", + "fig = plt.figure()\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "\n", + "data = pd.read_csv('examples/spx.csv', index_col=0, parse_dates=True)\n", + "spx = data['SPX']\n", + "\n", + "spx.plot(ax=ax, style='k-')\n", + "\n", + "crisis_data = [\n", + " (datetime(2007, 10, 11), 'Peak of bull market'),\n", + " (datetime(2008, 3, 12), 'Bear Stearns Fails'),\n", + " (datetime(2008, 9, 15), 'Lehman Bankruptcy')\n", + "]\n", + "\n", + "for date, label in crisis_data:\n", + " ax.annotate(label, xy=(date, spx.asof(date) + 75),\n", + " xytext=(date, spx.asof(date) + 225),\n", + " arrowprops=dict(facecolor='black', headwidth=4, width=2,\n", + " headlength=4),\n", + " horizontalalignment='left', verticalalignment='top')\n", + "\n", + "# Zoom in on 2007-2010\n", + "ax.set_xlim(['1/1/2007', '1/1/2011'])\n", + "ax.set_ylim([600, 1800])\n", + "\n", + "ax.set_title('Important dates in the 2008-2009 financial crisis')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ax.set_title('Important dates in the 2008–2009 financial crisis')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "fig = plt.figure()\n", + "ax = fig.add_subplot(1, 1, 1)\n", + "\n", + "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)\n", + "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)\n", + "pgon = 
plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n", + " color='g', alpha=0.5)\n", + "\n", + "ax.add_patch(rect)\n", + "ax.add_patch(circ)\n", + "ax.add_patch(pgon)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure(figsize=(12, 6)); ax = fig.add_subplot(1, 1, 1)\n", + "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)\n", + "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)\n", + "pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],\n", + " color='g', alpha=0.5)\n", + "ax.add_patch(rect)\n", + "ax.add_patch(circ)\n", + "ax.add_patch(pgon)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Saving Plots to File" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "plt.savefig('figpath.svg')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "plt.savefig('figpath.png', dpi=400, bbox_inches='tight')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "from io import BytesIO\n", + "buffer = BytesIO()\n", + "plt.savefig(buffer)\n", + "plot_data = buffer.getvalue()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### matplotlib Configuration" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "plt.rc('figure', figsize=(10, 10))" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "font_options = {'family' : 'monospace',\n", + " 'weight' : 'bold',\n", + " 'size' : 'small'}\n", + "plt.rc('font', **font_options)" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Plotting with pandas and seaborn" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Line Plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s = pd.Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))\n", + "s.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.random.randn(10, 4).cumsum(0),\n", + " columns=['A', 'B', 'C', 'D'],\n", + " index=np.arange(0, 100, 10))\n", + "df.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Bar Plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(2, 1)\n", + "data = pd.Series(np.random.rand(16), index=list('abcdefghijklmnop'))\n", + "data.plot.bar(ax=axes[0], color='k', alpha=0.7)\n", + "data.plot.barh(ax=axes[1], color='k', alpha=0.7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.random.seed(12348)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame(np.random.rand(6, 4),\n", 
+ " index=['one', 'two', 'three', 'four', 'five', 'six'],\n", + " columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))\n", + "df\n", + "df.plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.plot.barh(stacked=True, alpha=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips = pd.read_csv('examples/tips.csv')\n", + "party_counts = pd.crosstab(tips['day'], tips['size'])\n", + "party_counts\n", + "# Not many 1- and 6-person parties\n", + "party_counts = party_counts.loc[:, 2:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Normalize to sum to 1\n", + "party_pcts = party_counts.div(party_counts.sum(1), axis=0)\n", + "party_pcts\n", + "party_pcts.plot.bar()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])\n", + "tips.head()\n", + "sns.barplot(x='tip_pct', 
y='day', data=tips, orient='h')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.set(style=\"whitegrid\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Histograms and Density Plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips['tip_pct'].plot.hist(bins=50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips['tip_pct'].plot.density()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + 
}, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "comp1 = np.random.normal(0, 1, size=200)\n", + "comp2 = np.random.normal(10, 2, size=200)\n", + "values = pd.Series(np.concatenate([comp1, comp2]))\n", + "sns.distplot(values, bins=100, color='k')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Scatter or Point Plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "macro = pd.read_csv('examples/macrodata.csv')\n", + "data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]\n", + "trans_data = np.log(data).diff().dropna()\n", + "trans_data[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.regplot('m1', 'unemp', data=trans_data)\n", + "plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Facet Grids and Categorical Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ 
+ "sns.factorplot(x='day', y='tip_pct', hue='time', col='smoker',\n", + " kind='bar', data=tips[tips.tip_pct < 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.factorplot(x='day', y='tip_pct', row='time',\n", + " col='smoker',\n", + " kind='bar', data=tips[tips.tip_pct < 1])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.factorplot(x='tip_pct', y='day', kind='box',\n", + " data=tips[tips.tip_pct < 0.5])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Other Python Visualization Tools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch09" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Plotting and Visualization" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "import matplotlib", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - 
"%matplotlib notebook" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "A Brief matplotlib API Primer" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np", - "data = np.arange(10)", - "data", - "plt.plot(data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Figures and Subplots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax1 = fig.add_subplot(2, 2, 1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax2 = fig.add_subplot(2, 2, 2)", - "ax3 = fig.add_subplot(2, 2, 3)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "fig = plt.figure()", - "ax1 = fig.add_subplot(2, 2, 1)", - "ax2 = fig.add_subplot(2, 2, 2)", - "ax3 = fig.add_subplot(2, 2, 3)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.plot(np.random.randn(50).cumsum(), 'k--')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "_ = ax1.hist(np.random.randn(100), bins=20, color='k', alpha=0.3)", - "ax2.scatter(np.arange(30), np.arange(30) + 3 * np.random.randn(30))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": 
false, - "input": [ - "fig, axes = plt.subplots(2, 3)", - "axes" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Adjusting the spacing around subplots" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "subplots_adjust(left=None, bottom=None, right=None, top=None,", - " wspace=None, hspace=None)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)", - "for i in range(2):", - " for j in range(2):", - " axes[i, j].hist(np.random.randn(500), bins=50, color='k', alpha=0.5)", - "plt.subplots_adjust(wspace=0, hspace=0)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)", - "for i in range(2):", - " for j in range(2):", - " axes[i, j].hist(np.random.randn(500), bins=50, color='k', alpha=0.5)", - "plt.subplots_adjust(wspace=0, hspace=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Colors, Markers, and Line Styles" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ax.plot(x, y, 'g--')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ax.plot(x, y, linestyle='--', color='g')" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.random import randn", - "plt.plot(randn(30).cumsum(), 'ko--')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "plot(randn(30).cumsum(), color='k', linestyle='dashed', marker='o')" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = np.random.randn(30).cumsum()", - "plt.plot(data, 'k--', label='Default')", - "plt.plot(data, 'k-', drawstyle='steps-post', label='steps-post')", - "plt.legend(loc='best')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Ticks, Labels, and Legends" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Setting the title, axis labels, ticks, and ticklabels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()", - "ax = fig.add_subplot(1, 1, 1)", - "ax.plot(np.random.randn(1000).cumsum())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ticks = ax.set_xticks([0, 250, 500, 750, 1000])", - "labels = ax.set_xticklabels(['one', 'two', 'three', 'four', 'five'],", - " rotation=30, fontsize='small')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax.set_title('My first matplotlib plot')", - "ax.set_xlabel('Stages')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "props = {", - " 'title': 'My first matplotlib plot',", - " 'xlabel': 'Stages'", - "}", - "ax.set(**props)" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Adding legends" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from numpy.random import randn", - "fig = plt.figure(); ax = fig.add_subplot(1, 1, 1)", - "ax.plot(randn(1000).cumsum(), 'k', label='one')", - "ax.plot(randn(1000).cumsum(), 'k--', label='two')", - "ax.plot(randn(1000).cumsum(), 'k.', label='three')" - ], - "language": "python", - 
"metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax.legend(loc='best')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Annotations and Drawing on a Subplot" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ax.text(x, y, 'Hello world!',", - " family='monospace', fontsize=10)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime", - "", - "fig = plt.figure()", - "ax = fig.add_subplot(1, 1, 1)", - "", - "data = pd.read_csv('examples/spx.csv', index_col=0, parse_dates=True)", - "spx = data['SPX']", - "", - "spx.plot(ax=ax, style='k-')", - "", - "crisis_data = [", - " (datetime(2007, 10, 11), 'Peak of bull market'),", - " (datetime(2008, 3, 12), 'Bear Stearns Fails'),", - " (datetime(2008, 9, 15), 'Lehman Bankruptcy')", - "]", - "", - "for date, label in crisis_data:", - " ax.annotate(label, xy=(date, spx.asof(date) + 75),", - " xytext=(date, spx.asof(date) + 225),", - " arrowprops=dict(facecolor='black', headwidth=4, width=2,", - " headlength=4),", - " horizontalalignment='left', verticalalignment='top')", - "", - "# Zoom in on 2007-2010", - "ax.set_xlim(['1/1/2007', '1/1/2011'])", - "ax.set_ylim([600, 1800])", - "", - "ax.set_title('Important dates in the 2008-2009 financial crisis')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ax.set_title('Important dates in the 2008\u20132009 financial crisis')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "fig = plt.figure()", - "ax = fig.add_subplot(1, 1, 1)", - "", - "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)", - "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)", - "pgon = plt.Polygon([[0.15, 0.15], 
[0.35, 0.4], [0.2, 0.6]],", - " color='g', alpha=0.5)", - "", - "ax.add_patch(rect)", - "ax.add_patch(circ)", - "ax.add_patch(pgon)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure(figsize=(12, 6)); ax = fig.add_subplot(1, 1, 1)", - "rect = plt.Rectangle((0.2, 0.75), 0.4, 0.15, color='k', alpha=0.3)", - "circ = plt.Circle((0.7, 0.2), 0.15, color='b', alpha=0.3)", - "pgon = plt.Polygon([[0.15, 0.15], [0.35, 0.4], [0.2, 0.6]],", - " color='g', alpha=0.5)", - "ax.add_patch(rect)", - "ax.add_patch(circ)", - "ax.add_patch(pgon)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Saving Plots to File" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "plt.savefig('figpath.svg')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "plt.savefig('figpath.png', dpi=400, bbox_inches='tight')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "from io import BytesIO", - "buffer = BytesIO()", - "plt.savefig(buffer)", - "plot_data = buffer.getvalue()" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "matplotlib Configuration" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "plt.rc('figure', figsize=(10, 10))" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "font_options = {'family' : 'monospace',", - " 'weight' : 'bold',", - " 'size' : 'small'}", - "plt.rc('font', **font_options)" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Plotting with pandas and seaborn" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Line Plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "s = pd.Series(np.random.randn(10).cumsum(), index=np.arange(0, 100, 10))", - "s.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame(np.random.randn(10, 4).cumsum(0),", - " columns=['A', 'B', 'C', 'D'],", - " index=np.arange(0, 100, 10))", - "df.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Bar Plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig, axes = plt.subplots(2, 1)", - "data = pd.Series(np.random.rand(16), index=list('abcdefghijklmnop'))", - "data.plot.bar(ax=axes[0], color='k', alpha=0.7)", - "data.plot.barh(ax=axes[1], color='k', alpha=0.7)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12348)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame(np.random.rand(6, 4),", - " index=['one', 'two', 'three', 'four', 'five', 'six'],", - " columns=pd.Index(['A', 'B', 'C', 'D'], name='Genus'))", - "df", - "df.plot.bar()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.plot.barh(stacked=True, alpha=0.5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips = pd.read_csv('examples/tips.csv')", - "party_counts = pd.crosstab(tips['day'], tips['size'])", 
- "party_counts", - "# Not many 1- and 6-person parties", - "party_counts = party_counts.loc[:, 2:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Normalize to sum to 1", - "party_pcts = party_counts.div(party_counts.sum(1), axis=0)", - "party_pcts", - "party_pcts.plot.bar()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import seaborn as sns", - "tips['tip_pct'] = tips['tip'] / (tips['total_bill'] - tips['tip'])", - "tips.head()", - "sns.barplot(x='tip_pct', y='day', data=tips, orient='h')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.barplot(x='tip_pct', y='day', hue='time', data=tips, orient='h')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.set(style=\"whitegrid\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Histograms and Density Plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips['tip_pct'].plot.hist(bins=50)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { 
- "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips['tip_pct'].plot.density()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "comp1 = np.random.normal(0, 1, size=200)", - "comp2 = np.random.normal(10, 2, size=200)", - "values = pd.Series(np.concatenate([comp1, comp2]))", - "sns.distplot(values, bins=100, color='k')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Scatter or Point Plots" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "macro = pd.read_csv('examples/macrodata.csv')", - "data = macro[['cpi', 'm1', 'tbilrate', 'unemp']]", - "trans_data = np.log(data).diff().dropna()", - "trans_data[-5:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.regplot('m1', 'unemp', data=trans_data)", - "plt.title('Changes in log %s versus log %s' % ('m1', 'unemp'))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.pairplot(trans_data, diag_kind='kde', plot_kws={'alpha': 0.2})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Facet Grids and Categorical Data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.factorplot(x='day', 
y='tip_pct', hue='time', col='smoker',", - " kind='bar', data=tips[tips.tip_pct < 1])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.factorplot(x='day', y='tip_pct', row='time',", - " col='smoker',", - " kind='bar', data=tips[tips.tip_pct < 1])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.factorplot(x='tip_pct', y='day', kind='box',", - " data=tips[tips.tip_pct < 0.5])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Other Python Visualization Tools" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch10.ipynb b/ch10.ipynb index ca7c8e223..2c76ab822 100644 --- a/ch10.ipynb +++ b/ch10.ipynb @@ -1,1072 +1,1343 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Data Aggregation and Group Operations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## GroupBy Mechanics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],\n", + " 'key2' : ['one', 'two', 'one', 'two', 'one'],\n", + " 'data1' : np.random.randn(5),\n", + " 'data2' : np.random.randn(5)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = df['data1'].groupby(df['key1'])\n", + "grouped" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "means = df['data1'].groupby([df['key1'], df['key2']]).mean()\n", + "means" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "means.unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "states = 
np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])\n", + "years = np.array([2005, 2005, 2006, 2005, 2006])\n", + "df['data1'].groupby([states, years]).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.groupby('key1').mean()\n", + "df.groupby(['key1', 'key2']).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.groupby(['key1', 'key2']).size()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Iterating Over Groups" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for name, group in df.groupby('key1'):\n", + " print(name)\n", + " print(group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for (k1, k2), group in df.groupby(['key1', 'key2']):\n", + " print((k1, k2))\n", + " print(group)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pieces = dict(list(df.groupby('key1')))\n", + "pieces['b']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.dtypes\n", + "grouped = df.groupby(df.dtypes, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "for dtype, group 
in grouped:\n", + " print(dtype)\n", + " print(group)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Selecting a Column or Subset of Columns" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "df.groupby('key1')['data1']\n", + "df.groupby('key1')[['data2']]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "df['data1'].groupby(df['key1'])\n", + "df[['data2']].groupby(df['key1'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.groupby(['key1', 'key2'])[['data2']].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s_grouped = df.groupby(['key1', 'key2'])['data2']\n", + "s_grouped\n", + "s_grouped.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Grouping with Dicts and Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "people = pd.DataFrame(np.random.randn(5, 5),\n", + " columns=['a', 'b', 'c', 'd', 'e'],\n", + " index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])\n", + "people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values\n", + "people" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mapping = {'a': 'red', 'b': 'red', 'c': 'blue',\n", + " 'd': 'blue', 'e': 'red', 'f' : 'orange'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "by_column = people.groupby(mapping, axis=1)\n", + "by_column.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "map_series = pd.Series(mapping)\n", + "map_series\n", + "people.groupby(map_series, axis=1).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Grouping with Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "people.groupby(len).sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "key_list = ['one', 'one', 'one', 'two', 'two']\n", + "people.groupby([len, key_list]).min()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Grouping by Index Levels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],\n", + " [1, 3, 5, 1, 3]],\n", + " names=['cty', 'tenor'])\n", + "hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)\n", + "hier_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "hier_df.groupby(level='cty', axis=1).count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Data Aggregation" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df\n", + "grouped = df.groupby('key1')\n", + "grouped['data1'].quantile(0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def peak_to_peak(arr):\n", + " return arr.max() - arr.min()\n", + "grouped.agg(peak_to_peak)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Column-Wise and Multiple Function Application" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips = pd.read_csv('examples/tips.csv')\n", + "# Add tip percentage of total bill\n", + "tips['tip_pct'] = tips['tip'] / tips['total_bill']\n", + "tips[:6]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = tips.groupby(['day', 'smoker'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped_pct = grouped['tip_pct']\n", + "grouped_pct.agg('mean')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped_pct.agg(['mean', 'std', peak_to_peak])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "functions = ['count', 'mean', 'max']\n", + "result = grouped['tip_pct', 'total_bill'].agg(functions)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result['tip_pct']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]\n", + "grouped['tip_pct', 'total_bill'].agg(ftuples)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped.agg({'tip' : np.max, 'size' : 'sum'})\n", + "grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],\n", + " 'size' : 'sum'})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Returning Aggregated Data Without Row Indexes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.groupby(['day', 'smoker'], as_index=False).mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Apply: General split-apply-combine" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + 
"def top(df, n=5, column='tip_pct'):\n", + " return df.sort_values(by=column)[-n:]\n", + "top(tips, n=6)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.groupby('smoker').apply(top)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = tips.groupby('smoker')['tip_pct'].describe()\n", + "result\n", + "result.unstack('smoker')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "f = lambda x: x.describe()\n", + "grouped.apply(f)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Suppressing the Group Keys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.groupby('smoker', group_keys=False).apply(top)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Quantile and Bucket Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame({'data1': np.random.randn(1000),\n", + " 'data2': np.random.randn(1000)})\n", + "quartiles = pd.cut(frame.data1, 4)\n", + "quartiles[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def get_stats(group):\n", + " return {'min': group.min(), 'max': group.max(),\n", + " 'count': group.count(), 'mean': group.mean()}\n", + "grouped = frame.data2.groupby(quartiles)\n", + "grouped.apply(get_stats).unstack()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Return quantile numbers\n", + "grouping = pd.qcut(frame.data1, 10, labels=False)\n", + "grouped = frame.data2.groupby(grouping)\n", + "grouped.apply(get_stats).unstack()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Example: Filling Missing Values with Group-Specific Values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s = pd.Series(np.random.randn(6))\n", + "s[::2] = np.nan\n", + "s\n", + "s.fillna(s.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "states = ['Ohio', 'New York', 'Vermont', 'Florida',\n", + " 'Oregon', 'Nevada', 'California', 'Idaho']\n", + "group_key = ['East'] * 4 + ['West'] * 4\n", + "data = pd.Series(np.random.randn(8), index=states)\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data[['Vermont', 'Nevada', 'Idaho']] = np.nan\n", + "data\n", + "data.groupby(group_key).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fill_mean = lambda g: 
g.fillna(g.mean())\n", + "data.groupby(group_key).apply(fill_mean)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fill_values = {'East': 0.5, 'West': -1}\n", + "fill_func = lambda g: g.fillna(fill_values[g.name])\n", + "data.groupby(group_key).apply(fill_func)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Example: Random Sampling and Permutation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Hearts, Spades, Clubs, Diamonds\n", + "suits = ['H', 'S', 'C', 'D']\n", + "card_val = (list(range(1, 11)) + [10] * 3) * 4\n", + "base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']\n", + "cards = []\n", + "for suit in ['H', 'S', 'C', 'D']:\n", + " cards.extend(str(num) + suit for num in base_names)\n", + "\n", + "deck = pd.Series(card_val, index=cards)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "deck[:13]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def draw(deck, n=5):\n", + " return deck.sample(n)\n", + "draw(deck)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "get_suit = lambda card: card[-1] # last letter is suit\n", + "deck.groupby(get_suit).apply(draw, n=2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + 
"source": [ + "deck.groupby(get_suit, group_keys=False).apply(draw, n=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Example: Group Weighted Average and Correlation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',\n", + " 'b', 'b', 'b', 'b'],\n", + " 'data': np.random.randn(8),\n", + " 'weights': np.random.rand(8)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = df.groupby('category')\n", + "get_wavg = lambda g: np.average(g['data'], weights=g['weights'])\n", + "grouped.apply(get_wavg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,\n", + " index_col=0)\n", + "close_px.info()\n", + "close_px[-4:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "spx_corr = lambda x: x.corrwith(x['SPX'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rets = close_px.pct_change().dropna()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "get_year = lambda x: x.year\n", + "by_year = rets.groupby(get_year)\n", + "by_year.apply(spx_corr)" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Example: Group-Wise Linear Regression" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import statsmodels.api as sm\n", + "def regress(data, yvar, xvars):\n", + " Y = data[yvar]\n", + " X = data[xvars]\n", + " X['intercept'] = 1.\n", + " result = sm.OLS(Y, X).fit()\n", + " return result.params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "by_year.apply(regress, 'AAPL', ['SPX'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Pivot Tables and Cross-Tabulation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.pivot_table(index=['day', 'smoker'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],\n", + " columns='smoker')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],\n", + " columns='smoker', margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',\n", + " aggfunc=len, margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],\n", + " columns='day', aggfunc='mean', fill_value=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Cross-Tabulations: Crosstab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "data = \"\"\"\\\n", + "Sample Nationality Handedness\n", + "1 USA Right-handed\n", + "2 Japan Left-handed\n", + "3 USA Right-handed\n", + "4 Japan Right-handed\n", + "5 Japan Left-handed\n", + "6 Japan Right-handed\n", + "7 USA Right-handed\n", + "8 USA Left-handed\n", + "9 Japan Right-handed\n", + "10 USA Right-handed\"\"\"\n", + "data = pd.read_table(StringIO(data), sep='\\s+')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.crosstab(data.Nationality, data.Handedness, margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.crosstab([tips.time, tips.day], tips.smoker, margins=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch10" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data Aggregation and " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "GroupBy Mechanics" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],", - " 'key2' : ['one', 'two', 'one', 'two', 'one'],", - " 'data1' : np.random.randn(5),", - " 'data2' : np.random.randn(5)})", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = df['data1'].groupby(df['key1'])", - "grouped" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "means = df['data1'].groupby([df['key1'], df['key2']]).mean()", - "means" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"means.unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = np.array(['Ohio', 'California', 'California', 'Ohio', 'Ohio'])", - "years = np.array([2005, 2005, 2006, 2005, 2006])", - "df['data1'].groupby([states, years]).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby('key1').mean()", - "df.groupby(['key1', 'key2']).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby(['key1', 'key2']).size()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Iterating Over Groups" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for name, group in df.groupby('key1'):", - " print(name)", - " print(group)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for (k1, k2), group in df.groupby(['key1', 'key2']):", - " print((k1, k2))", - " print(group)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pieces = dict(list(df.groupby('key1')))", - "pieces['b']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.dtypes", - "grouped = df.groupby(df.dtypes, axis=1)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "for dtype, group in grouped:", - " print(dtype)", - " print(group)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Selecting a Column or 
Subset of Columns" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "df.groupby('key1')['data1']", - "df.groupby('key1')[['data2']]" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "df['data1'].groupby(df['key1'])", - "df[['data2']].groupby(df['key1'])" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.groupby(['key1', 'key2'])[['data2']].mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s_grouped = df.groupby(['key1', 'key2'])['data2']", - "s_grouped", - "s_grouped.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouping with Dicts and Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "people = pd.DataFrame(np.random.randn(5, 5),", - " columns=['a', 'b', 'c', 'd', 'e'],", - " index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])", - "people.iloc[2:3, [1, 2]] = np.nan # Add a few NA values", - "people" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mapping = {'a': 'red', 'b': 'red', 'c': 'blue',", - " 'd': 'blue', 'e': 'red', 'f' : 'orange'}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_column = people.groupby(mapping, axis=1)", - "by_column.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "map_series = pd.Series(mapping)", - "map_series", - "people.groupby(map_series, axis=1).count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouping with Functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "people.groupby(len).sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "key_list = ['one', 'one', 'one', 'two', 'two']", - "people.groupby([len, key_list]).min()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouping by Index Levels" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "columns = pd.MultiIndex.from_arrays([['US', 'US', 'US', 'JP', 'JP'],", - " [1, 3, 5, 1, 3]],", - " names=['cty', 'tenor'])", - "hier_df = pd.DataFrame(np.random.randn(4, 5), columns=columns)", - "hier_df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "hier_df.groupby(level='cty', axis=1).count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Data Aggregation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df", - "grouped = df.groupby('key1')", - "grouped['data1'].quantile(0.9)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def peak_to_peak(arr):", - " return arr.max() - arr.min()", - "grouped.agg(peak_to_peak)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.describe()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Column-Wise and Multiple Function Application" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips = pd.read_csv('examples/tips.csv')", - "# Add tip percentage of total bill", - "tips['tip_pct'] = tips['tip'] / tips['total_bill']", - 
"tips[:6]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = tips.groupby(['day', 'smoker'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped_pct = grouped['tip_pct']", - "grouped_pct.agg('mean')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped_pct.agg(['mean', 'std', peak_to_peak])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped_pct.agg([('foo', 'mean'), ('bar', np.std)])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "functions = ['count', 'mean', 'max']", - "result = grouped['tip_pct', 'total_bill'].agg(functions)", - "result" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result['tip_pct']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ftuples = [('Durchschnitt', 'mean'), ('Abweichung', np.var)]", - "grouped['tip_pct', 'total_bill'].agg(ftuples)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped.agg({'tip' : np.max, 'size' : 'sum'})", - "grouped.agg({'tip_pct' : ['min', 'max', 'mean', 'std'],", - " 'size' : 'sum'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Returning Aggregated Data Without Row Indexes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby(['day', 'smoker'], as_index=False).mean()" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Apply: General split-apply-combine" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def top(df, n=5, column='tip_pct'):", - " return df.sort_values(by=column)[-n:]", - "top(tips, n=6)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby('smoker').apply(top)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby(['smoker', 'day']).apply(top, n=1, column='total_bill')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = tips.groupby('smoker')['tip_pct'].describe()", - "result", - "result.unstack('smoker')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "f = lambda x: x.describe()", - "grouped.apply(f)" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Suppressing the Group Keys" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.groupby('smoker', group_keys=False).apply(top)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Quantile and Bucket Analysis" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame({'data1': np.random.randn(1000),", - " 'data2': np.random.randn(1000)})", - "quartiles = pd.cut(frame.data1, 4)", - "quartiles[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_stats(group):", - " return {'min': group.min(), 'max': group.max(),", - " 'count': 
group.count(), 'mean': group.mean()}", - "grouped = frame.data2.groupby(quartiles)", - "grouped.apply(get_stats).unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Return quantile numbers", - "grouping = pd.qcut(frame.data1, 10, labels=False)", - "grouped = frame.data2.groupby(grouping)", - "grouped.apply(get_stats).unstack()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Example: Filling Missing Values with Group-Specific\n Values" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = pd.Series(np.random.randn(6))", - "s[::2] = np.nan", - "s", - "s.fillna(s.mean())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "states = ['Ohio', 'New York', 'Vermont', 'Florida',", - " 'Oregon', 'Nevada', 'California', 'Idaho']", - "group_key = ['East'] * 4 + ['West'] * 4", - "data = pd.Series(np.random.randn(8), index=states)", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data[['Vermont', 'Nevada', 'Idaho']] = np.nan", - "data", - "data.groupby(group_key).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fill_mean = lambda g: g.fillna(g.mean())", - "data.groupby(group_key).apply(fill_mean)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fill_values = {'East': 0.5, 'West': -1}", - "fill_func = lambda g: g.fillna(fill_values[g.name])", - "data.groupby(group_key).apply(fill_func)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - 
"source": [ - "Example: Random Sampling and Permutation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Hearts, Spades, Clubs, Diamonds", - "suits = ['H', 'S', 'C', 'D']", - "card_val = (list(range(1, 11)) + [10] * 3) * 4", - "base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']", - "cards = []", - "for suit in ['H', 'S', 'C', 'D']:", - " cards.extend(str(num) + suit for num in base_names)", - "", - "deck = pd.Series(card_val, index=cards)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "deck[:13]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def draw(deck, n=5):", - " return deck.sample(n)", - "draw(deck)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "get_suit = lambda card: card[-1] # last letter is suit", - "deck.groupby(get_suit).apply(draw, n=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "deck.groupby(get_suit, group_keys=False).apply(draw, n=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Example: Group Weighted Average and Correlation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame({'category': ['a', 'a', 'a', 'a',", - " 'b', 'b', 'b', 'b'],", - " 'data': np.random.randn(8),", - " 'weights': np.random.rand(8)})", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = df.groupby('category')", - "get_wavg = lambda g: np.average(g['data'], weights=g['weights'])", - "grouped.apply(get_wavg)" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px = pd.read_csv('examples/stock_px_2.csv', parse_dates=True,", - " index_col=0)", - "close_px.info()", - "close_px[-4:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "spx_corr = lambda x: x.corrwith(x['SPX'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rets = close_px.pct_change().dropna()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "get_year = lambda x: x.year", - "by_year = rets.groupby(get_year)", - "by_year.apply(spx_corr)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_year.apply(lambda g: g['AAPL'].corr(g['MSFT']))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Example: Group-Wise Linear Regression" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import statsmodels.api as sm", - "def regress(data, yvar, xvars):", - " Y = data[yvar]", - " X = data[xvars]", - " X['intercept'] = 1.", - " result = sm.OLS(Y, X).fit()", - " return result.params" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_year.apply(regress, 'AAPL', ['SPX'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Pivot Tables and Cross-Tabulation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table(index=['day', 'smoker'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", 
- "collapsed": false, - "input": [ - "tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],", - " columns='smoker')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table(['tip_pct', 'size'], index=['time', 'day'],", - " columns='smoker', margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table('tip_pct', index=['time', 'smoker'], columns='day',", - " aggfunc=len, margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tips.pivot_table('tip_pct', index=['time', 'size', 'smoker'],", - " columns='day', aggfunc='mean', fill_value=0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Cross-Tabulations: Crosstab" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from io import StringIO", - "data = \"\"\"\\", - "Sample Nationality Handedness", - "1 USA Right-handed", - "2 Japan Left-handed", - "3 USA Right-handed", - "4 Japan Right-handed", - "5 Japan Left-handed", - "6 Japan Right-handed", - "7 USA Right-handed", - "8 USA Left-handed", - "9 Japan Right-handed", - "10 USA Right-handed\"\"\"", - "data = pd.read_table(StringIO(data), sep='\\s+')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.crosstab(data.Nationality, data.Handedness, margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.crosstab([tips.time, tips.day], tips.smoker, 
margins=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch11.ipynb b/ch11.ipynb index 7a6c7f85f..d943cf331 100644 --- a/ch11.ipynb +++ b/ch11.ipynb @@ -1,1611 +1,2024 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Time Series" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Date and Time Data Types and Tools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + 
"editable": true + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "now = datetime.now()\n", + "now\n", + "now.year, now.month, now.day" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)\n", + "delta\n", + "delta.days\n", + "delta.seconds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from datetime import timedelta\n", + "start = datetime(2011, 1, 7)\n", + "start + timedelta(12)\n", + "start - 2 * timedelta(12)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Converting Between String and Datetime" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp = datetime(2011, 1, 3)\n", + "str(stamp)\n", + "stamp.strftime('%Y-%m-%d')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "value = '2011-01-03'\n", + "datetime.strptime(value, '%Y-%m-%d')\n", + "datestrs = ['7/6/2011', '8/6/2011']\n", + "[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from dateutil.parser import parse\n", + "parse('2011-01-03')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "parse('Jan 31, 1997 10:45 PM')" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "parse('6/12/2011', dayfirst=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']\n", + "pd.to_datetime(datestrs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "idx = pd.to_datetime(datestrs + [None])\n", + "idx\n", + "idx[2]\n", + "pd.isnull(idx)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Time Series Basics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),\n", + " datetime(2011, 1, 7), datetime(2011, 1, 8),\n", + " datetime(2011, 1, 10), datetime(2011, 1, 12)]\n", + "ts = pd.Series(np.random.randn(6), index=dates)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts + ts[::2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.index.dtype" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp = ts.index[0]\n", + "stamp" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Indexing, Selection, Subsetting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp = ts.index[2]\n", + "ts[stamp]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts['1/10/2011']\n", + "ts['20110110']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "longer_ts = pd.Series(np.random.randn(1000),\n", + " index=pd.date_range('1/1/2000', periods=1000))\n", + "longer_ts\n", + "longer_ts['2001']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "longer_ts['2001-05']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts[datetime(2011, 1, 7):]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts\n", + "ts['1/6/2011':'1/11/2011']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.truncate(after='1/9/2011')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + 
"deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')\n", + "long_df = pd.DataFrame(np.random.randn(100, 4),\n", + " index=dates,\n", + " columns=['Colorado', 'Texas',\n", + " 'New York', 'Ohio'])\n", + "long_df.loc['5-2001']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Time Series with Duplicate Indices" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',\n", + " '1/2/2000', '1/3/2000'])\n", + "dup_ts = pd.Series(np.arange(5), index=dates)\n", + "dup_ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dup_ts.index.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dup_ts['1/3/2000'] # not duplicated\n", + "dup_ts['1/2/2000'] # duplicated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = dup_ts.groupby(level=0)\n", + "grouped.mean()\n", + "grouped.count()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Date Ranges, Frequencies, and Shifting" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts\n", + "resampler = ts.resample('D')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + 
"editable": true + }, + "source": [ + "### Generating Date Ranges" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "index = pd.date_range('2012-04-01', '2012-06-01')\n", + "index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range(start='2012-04-01', periods=20)\n", + "pd.date_range(end='2012-06-01', periods=20)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range('2000-01-01', '2000-12-01', freq='BM')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range('2012-05-02 12:56:31', periods=5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Frequencies and Date Offsets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from pandas.tseries.offsets import Hour, Minute\n", + "hour = Hour()\n", + "hour" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "four_hours = Hour(4)\n", + "four_hours" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4h')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "Hour(2) + Minute(30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range('2000-01-01', periods=10, freq='1h30min')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Week of month dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')\n", + "list(rng)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Shifting (Leading and Lagging) Data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts = pd.Series(np.random.randn(4),\n", + " index=pd.date_range('1/1/2000', periods=4, freq='M'))\n", + "ts\n", + "ts.shift(2)\n", + "ts.shift(-2)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "ts / ts.shift(1) - 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.shift(2, freq='M')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "ts.shift(3, freq='D')\n", + "ts.shift(1, freq='90T')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Shifting dates with offsets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from pandas.tseries.offsets import Day, MonthEnd\n", + "now = datetime(2011, 11, 17)\n", + "now + 3 * Day()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "now + MonthEnd()\n", + "now + MonthEnd(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "offset = MonthEnd()\n", + "offset.rollforward(now)\n", + "offset.rollback(now)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts = pd.Series(np.random.randn(20),\n", + " index=pd.date_range('1/15/2000', periods=20, freq='4d'))\n", + "ts\n", + "ts.groupby(offset.rollforward).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.resample('M').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Time Zone Handling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import pytz\n", + "pytz.common_timezones[-5:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { 
+ "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tz = pytz.timezone('America/New_York')\n", + "tz" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Time Zone Localization and Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')\n", + "ts = pd.Series(np.random.randn(len(rng)), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(ts.index.tz)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts\n", + "ts_utc = ts.tz_localize('UTC')\n", + "ts_utc\n", + "ts_utc.index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts_utc.tz_convert('America/New_York')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts_eastern = ts.tz_localize('America/New_York')\n", + "ts_eastern.tz_convert('UTC')\n", + "ts_eastern.tz_convert('Europe/Berlin')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "ts.index.tz_localize('Asia/Shanghai')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Operations with Time Zone−Aware Timestamp Objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp = pd.Timestamp('2011-03-12 04:00')\n", + "stamp_utc = stamp.tz_localize('utc')\n", + "stamp_utc.tz_convert('America/New_York')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')\n", + "stamp_moscow" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp_utc.value\n", + "stamp_utc.tz_convert('America/New_York').value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from pandas.tseries.offsets import Hour\n", + "stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')\n", + "stamp\n", + "stamp + Hour()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')\n", + "stamp\n", + "stamp + 2 * Hour()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Operations Between Different Time Zones" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": 
[], + "source": [ + "rng = pd.date_range('3/7/2012 9:30', periods=10, freq='B')\n", + "ts = pd.Series(np.random.randn(len(rng)), index=rng)\n", + "ts\n", + "ts1 = ts[:7].tz_localize('Europe/London')\n", + "ts2 = ts1[2:].tz_convert('Europe/Moscow')\n", + "result = ts1 + ts2\n", + "result.index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Periods and Period Arithmetic" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p = pd.Period(2007, freq='A-DEC')\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p + 5\n", + "p - 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.Period('2014', freq='A-DEC') - p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')\n", + "rng" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.Series(np.random.randn(6), index=rng)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "values = ['2001Q3', '2002Q2', '2003Q1']\n", + "index = pd.PeriodIndex(values, freq='Q-DEC')\n", + "index" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Period Frequency Conversion" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p = pd.Period('2007', freq='A-DEC')\n", + "p\n", + "p.asfreq('M', how='start')\n", + "p.asfreq('M', how='end')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p = pd.Period('2007', freq='A-JUN')\n", + "p\n", + "p.asfreq('M', 'start')\n", + "p.asfreq('M', 'end')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p = pd.Period('Aug-2007', 'M')\n", + "p.asfreq('A-JUN')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.period_range('2006', '2009', freq='A-DEC')\n", + "ts = pd.Series(np.random.randn(len(rng)), index=rng)\n", + "ts\n", + "ts.asfreq('M', how='start')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.asfreq('B', how='end')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Quarterly Period Frequencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p = pd.Period('2012Q4', freq='Q-JAN')\n", + "p" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p.asfreq('D', 'start')\n", + "p.asfreq('D', 'end')" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60\n", + "p4pm\n", + "p4pm.to_timestamp()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')\n", + "ts = pd.Series(np.arange(len(rng)), index=rng)\n", + "ts\n", + "new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60\n", + "ts.index = new_rng.to_timestamp()\n", + "ts" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Converting Timestamps to Periods (and Back)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.date_range('2000-01-01', periods=3, freq='M')\n", + "ts = pd.Series(np.random.randn(3), index=rng)\n", + "ts\n", + "pts = ts.to_period()\n", + "pts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.date_range('1/29/2000', periods=6, freq='D')\n", + "ts2 = pd.Series(np.random.randn(6), index=rng)\n", + "ts2\n", + "ts2.to_period('M')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pts = ts2.to_period()\n", + "pts\n", + "pts.to_timestamp(how='end')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Creating a PeriodIndex from Arrays" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.read_csv('examples/macrodata.csv')\n", + "data.head(5)\n", + "data.year\n", + "data.quarter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "index = pd.PeriodIndex(year=data.year, quarter=data.quarter,\n", + " freq='Q-DEC')\n", + "index\n", + "data.index = index\n", + "data.infl" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Resampling and Frequency Conversion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.date_range('2000-01-01', periods=100, freq='D')\n", + "ts = pd.Series(np.random.randn(len(rng)), index=rng)\n", + "ts\n", + "ts.resample('M').mean()\n", + "ts.resample('M', kind='period').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Downsampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "rng = pd.date_range('2000-01-01', periods=12, freq='T')\n", + "ts = pd.Series(np.arange(12), index=rng)\n", + "ts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.resample('5min', closed='right').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.resample('5min', closed='right').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.resample('5min', closed='right', label='right').sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.resample('5min', closed='right',\n", + " label='right', loffset='-1s').sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Open-High-Low-Close (OHLC) resampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ts.resample('5min').ohlc()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Upsampling and Interpolation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.random.randn(2, 4),\n", + " index=pd.date_range('1/1/2000', periods=2,\n", + " freq='W-WED'),\n", + " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", + "frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df_daily = frame.resample('D').asfreq()\n", + "df_daily" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.resample('D').ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.resample('D').ffill(limit=2)" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame.resample('W-THU').ffill()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Resampling with Periods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame = pd.DataFrame(np.random.randn(24, 4),\n", + " index=pd.period_range('1-2000', '12-2001',\n", + " freq='M'),\n", + " columns=['Colorado', 'Texas', 'New York', 'Ohio'])\n", + "frame[:5]\n", + "annual_frame = frame.resample('A-DEC').mean()\n", + "annual_frame" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Q-DEC: Quarterly, year ending in December\n", + "annual_frame.resample('Q-DEC').ffill()\n", + "annual_frame.resample('Q-DEC', convention='end').ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "annual_frame.resample('Q-MAR').ffill()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Moving Window Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "close_px_all = pd.read_csv('examples/stock_px_2.csv',\n", + " parse_dates=True, index_col=0)\n", + "close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]\n", + "close_px = close_px.resample('B').ffill()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": 
true, + "editable": true + }, + "outputs": [], + "source": [ + "close_px.AAPL.plot()\n", + "close_px.AAPL.rolling(250).mean().plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "appl_std250 = close_px.AAPL.rolling(250, min_periods=10).std()\n", + "appl_std250[5:12]\n", + "appl_std250.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "expanding_mean = appl_std250.expanding().mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "close_px.rolling(60).mean().plot(logy=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "close_px.rolling('20D').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Exponentially Weighted Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + 
"source": [ + "aapl_px = close_px.AAPL['2006':'2007']\n", + "ma60 = aapl_px.rolling(30, min_periods=20).mean()\n", + "ewma60 = aapl_px.ewm(span=30).mean()\n", + "ma60.plot(style='k--', label='Simple MA')\n", + "ewma60.plot(style='k-', label='EW MA')\n", + "plt.legend()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Binary Moving Window Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "spx_px = close_px_all['SPX']\n", + "spx_rets = spx_px.pct_change()\n", + "returns = close_px.pct_change()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "corr = returns.AAPL.rolling(125, min_periods=100).corr(spx_rets)\n", + "corr.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "corr = returns.rolling(125, min_periods=100).corr(spx_rets)\n", + "corr.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### User-Defined Moving Window Functions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from scipy.stats import percentileofscore\n", + "score_at_2percent = lambda x: percentileofscore(x, 0.02)\n", + "result = returns.AAPL.rolling(250).apply(score_at_2percent)\n", + "result.plot()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch11" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Time Series" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Date and Time Data Types and Tools" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import datetime", - "now = datetime.now()", - "now", - "now.year, now.month, now.day" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)", - "delta", - "delta.days", - "delta.seconds" - ], - "language": "python", - "metadata": {}, - "outputs": 
[] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from datetime import timedelta", - "start = datetime(2011, 1, 7)", - "start + timedelta(12)", - "start - 2 * timedelta(12)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Converting Between String and Datetime" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = datetime(2011, 1, 3)", - "str(stamp)", - "stamp.strftime('%Y-%m-%d')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "value = '2011-01-03'", - "datetime.strptime(value, '%Y-%m-%d')", - "datestrs = ['7/6/2011', '8/6/2011']", - "[datetime.strptime(x, '%m/%d/%Y') for x in datestrs]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from dateutil.parser import parse", - "parse('2011-01-03')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "parse('Jan 31, 1997 10:45 PM')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "parse('6/12/2011', dayfirst=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "datestrs = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']", - "pd.to_datetime(datestrs)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "idx = pd.to_datetime(datestrs + [None])", - "idx", - "idx[2]", - "pd.isnull(idx)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Time Series Basics" - ] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "from datetime import datetime", - "dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),", - " datetime(2011, 1, 7), datetime(2011, 1, 8),", - " datetime(2011, 1, 10), datetime(2011, 1, 12)]", - "ts = pd.Series(np.random.randn(6), index=dates)", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts + ts[::2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.index.dtype" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = ts.index[0]", - "stamp" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Indexing, Selection, Subsetting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = ts.index[2]", - "ts[stamp]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts['1/10/2011']", - "ts['20110110']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "longer_ts = pd.Series(np.random.randn(1000),", - " index=pd.date_range('1/1/2000', periods=1000))", - "longer_ts", - "longer_ts['2001']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "longer_ts['2001-05']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts[datetime(2011, 1, 7):]" - ], - "language": "python", - "metadata": {}, - "outputs": [] 
- }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts", - "ts['1/6/2011':'1/11/2011']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.truncate(after='1/9/2011')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dates = pd.date_range('1/1/2000', periods=100, freq='W-WED')", - "long_df = pd.DataFrame(np.random.randn(100, 4),", - " index=dates,", - " columns=['Colorado', 'Texas',", - " 'New York', 'Ohio'])", - "long_df.loc['5-2001']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Time Series with Duplicate Indices" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dates = pd.DatetimeIndex(['1/1/2000', '1/2/2000', '1/2/2000',", - " '1/2/2000', '1/3/2000'])", - "dup_ts = pd.Series(np.arange(5), index=dates)", - "dup_ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dup_ts.index.is_unique" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dup_ts['1/3/2000'] # not duplicated", - "dup_ts['1/2/2000'] # duplicated" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = dup_ts.groupby(level=0)", - "grouped.mean()", - "grouped.count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Date Ranges, Frequencies, and Shifting" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts", - "resampler = ts.resample('D')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"heading", - "level": 3, - "metadata": {}, - "source": [ - "Generating Date Ranges" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index = pd.date_range('2012-04-01', '2012-06-01')", - "index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range(start='2012-04-01', periods=20)", - "pd.date_range(end='2012-06-01', periods=20)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('2000-01-01', '2000-12-01', freq='BM')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('2012-05-02 12:56:31', periods=5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('2012-05-02 12:56:31', periods=5, normalize=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Frequencies and Date Offsets" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas.tseries.offsets import Hour, Minute", - "hour = Hour()", - "hour" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "four_hours = Hour(4)", - "four_hours" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('2000-01-01', '2000-01-03 23:59', freq='4h')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "Hour(2) + Minute(30)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"pd.date_range('2000-01-01', periods=10, freq='1h30min')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Week of month dates" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('2012-01-01', '2012-09-01', freq='WOM-3FRI')", - "list(rng)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Shifting (Leading and Lagging) Data" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts = pd.Series(np.random.randn(4),", - " index=pd.date_range('1/1/2000', periods=4, freq='M'))", - "ts", - "ts.shift(2)", - "ts.shift(-2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "ts / ts.shift(1) - 1" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(2, freq='M')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.shift(3, freq='D')", - "ts.shift(1, freq='90T')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Shifting dates with offsets" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas.tseries.offsets import Day, MonthEnd", - "now = datetime(2011, 11, 17)", - "now + 3 * Day()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "now + MonthEnd()", - "now + MonthEnd(2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "offset = MonthEnd()", - "offset.rollforward(now)", - "offset.rollback(now)" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts = pd.Series(np.random.randn(20),", - " index=pd.date_range('1/15/2000', periods=20, freq='4d'))", - "ts", - "ts.groupby(offset.rollforward).mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('M').mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Time Zone Handling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pytz", - "pytz.common_timezones[-5:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tz = pytz.timezone('America/New_York')", - "tz" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Time Zone Localization and Conversion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('3/9/2012 9:30', periods=6, freq='D')", - "ts = pd.Series(np.random.randn(len(rng)), index=rng)", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(ts.index.tz)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.date_range('3/9/2012 9:30', periods=10, freq='D', tz='UTC')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts", - "ts_utc = ts.tz_localize('UTC')", - "ts_utc", - "ts_utc.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_utc.tz_convert('America/New_York')" - ], - "language": "python", 
- "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts_eastern = ts.tz_localize('America/New_York')", - "ts_eastern.tz_convert('UTC')", - "ts_eastern.tz_convert('Europe/Berlin')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.index.tz_localize('Asia/Shanghai')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Operations with Time Zone\u2212Aware Timestamp Objects" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = pd.Timestamp('2011-03-12 04:00')", - "stamp_utc = stamp.tz_localize('utc')", - "stamp_utc.tz_convert('America/New_York')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp_moscow = pd.Timestamp('2011-03-12 04:00', tz='Europe/Moscow')", - "stamp_moscow" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp_utc.value", - "stamp_utc.tz_convert('America/New_York').value" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from pandas.tseries.offsets import Hour", - "stamp = pd.Timestamp('2012-03-12 01:30', tz='US/Eastern')", - "stamp", - "stamp + Hour()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "stamp = pd.Timestamp('2012-11-04 00:30', tz='US/Eastern')", - "stamp", - "stamp + 2 * Hour()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Operations Between Different Time Zones" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = 
pd.date_range('3/7/2012 9:30', periods=10, freq='B')", - "ts = pd.Series(np.random.randn(len(rng)), index=rng)", - "ts", - "ts1 = ts[:7].tz_localize('Europe/London')", - "ts2 = ts1[2:].tz_convert('Europe/Moscow')", - "result = ts1 + ts2", - "result.index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Periods and Period Arithmetic" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period(2007, freq='A-DEC')", - "p" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p + 5", - "p - 2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.Period('2014', freq='A-DEC') - p" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.period_range('2000-01-01', '2000-06-30', freq='M')", - "rng" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.Series(np.random.randn(6), index=rng)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = ['2001Q3', '2002Q2', '2003Q1']", - "index = pd.PeriodIndex(values, freq='Q-DEC')", - "index" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Period Frequency Conversion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('2007', freq='A-DEC')", - "p", - "p.asfreq('M', how='start')", - "p.asfreq('M', how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('2007', 
freq='A-JUN')", - "p", - "p.asfreq('M', 'start')", - "p.asfreq('M', 'end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('Aug-2007', 'M')", - "p.asfreq('A-JUN')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.period_range('2006', '2009', freq='A-DEC')", - "ts = pd.Series(np.random.randn(len(rng)), index=rng)", - "ts", - "ts.asfreq('M', how='start')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.asfreq('B', how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Quarterly Period Frequencies" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p = pd.Period('2012Q4', freq='Q-JAN')", - "p" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p.asfreq('D', 'start')", - "p.asfreq('D', 'end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "p4pm = (p.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60", - "p4pm", - "p4pm.to_timestamp()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.period_range('2011Q3', '2012Q4', freq='Q-JAN')", - "ts = pd.Series(np.arange(len(rng)), index=rng)", - "ts", - "new_rng = (rng.asfreq('B', 'e') - 1).asfreq('T', 's') + 16 * 60", - "ts.index = new_rng.to_timestamp()", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Converting Timestamps to Periods (and Back)" - ] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('2000-01-01', periods=3, freq='M')", - "ts = pd.Series(np.random.randn(3), index=rng)", - "ts", - "pts = ts.to_period()", - "pts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('1/29/2000', periods=6, freq='D')", - "ts2 = pd.Series(np.random.randn(6), index=rng)", - "ts2", - "ts2.to_period('M')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pts = ts2.to_period()", - "pts", - "pts.to_timestamp(how='end')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Creating a PeriodIndex from Arrays" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.read_csv('examples/macrodata.csv')", - "data.head(5)", - "data.year", - "data.quarter" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "index = pd.PeriodIndex(year=data.year, quarter=data.quarter,", - " freq='Q-DEC')", - "index", - "data.index = index", - "data.infl" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Resampling and Frequency Conversion" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = pd.date_range('2000-01-01', periods=100, freq='D')", - "ts = pd.Series(np.random.randn(len(rng)), index=rng)", - "ts", - "ts.resample('M').mean()", - "ts.resample('M', kind='period').mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Downsampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "rng = 
pd.date_range('2000-01-01', periods=12, freq='T')", - "ts = pd.Series(np.arange(12), index=rng)", - "ts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', closed='right').sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', closed='right').sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', closed='right', label='right').sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min', closed='right',", - " label='right', loffset='-1s').sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Open-High-Low-Close (OHLC) resampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ts.resample('5min').ohlc()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Upsampling and Interpolation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.random.randn(2, 4),", - " index=pd.date_range('1/1/2000', periods=2,", - " freq='W-WED'),", - " columns=['Colorado', 'Texas', 'New York', 'Ohio'])", - "frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df_daily = frame.resample('D').asfreq()", - "df_daily" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.resample('D').ffill()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, 
- { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.resample('D').ffill(limit=2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame.resample('W-THU').ffill()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Resampling with Periods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame = pd.DataFrame(np.random.randn(24, 4),", - " index=pd.period_range('1-2000', '12-2001',", - " freq='M'),", - " columns=['Colorado', 'Texas', 'New York', 'Ohio'])", - "frame[:5]", - "annual_frame = frame.resample('A-DEC').mean()", - "annual_frame" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Q-DEC: Quarterly, year ending in December", - "annual_frame.resample('Q-DEC').ffill()", - "annual_frame.resample('Q-DEC', convention='end').ffill()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "annual_frame.resample('Q-MAR').ffill()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Moving Window Functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px_all = pd.read_csv('examples/stock_px_2.csv',", - " parse_dates=True, index_col=0)", - "close_px = close_px_all[['AAPL', 'MSFT', 'XOM']]", - "close_px = close_px.resample('B').ffill()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px.AAPL.plot()", - "close_px.AAPL.rolling(250).mean().plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "appl_std250 = close_px.AAPL.rolling(250, min_periods=10).std()", - "appl_std250[5:12]", - "appl_std250.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "expanding_mean = appl_std250.expanding().mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px.rolling(60).mean().plot(logy=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "close_px.rolling('20D').mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Exponentially Weighted Functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "aapl_px = close_px.AAPL['2006':'2007']", - "ma60 = aapl_px.rolling(30, min_periods=20).mean()", - "ewma60 = aapl_px.ewm(span=30).mean()", - "ma60.plot(style='k--', label='Simple MA')", - "ewma60.plot(style='k-', label='EW MA')", - "plt.legend()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Binary Moving Window Functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "spx_px = 
close_px_all['SPX']", - "spx_rets = spx_px.pct_change()", - "returns = close_px.pct_change()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "corr = returns.AAPL.rolling(125, min_periods=100).corr(spx_rets)", - "corr.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "corr = returns.rolling(125, min_periods=100).corr(spx_rets)", - "corr.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "User-Defined Moving Window Functions" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from scipy.stats import percentileofscore", - "score_at_2percent = lambda x: percentileofscore(x, 0.02)", - "result = returns.AAPL.rolling(250).apply(score_at_2percent)", - "result.plot()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": 
"text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch12.ipynb b/ch12.ipynb index 13e37098e..c5e3d76c5 100644 --- a/ch12.ipynb +++ b/ch12.ipynb @@ -1,752 +1,945 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Advanced pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Categorical Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Background and Motivation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np; import pandas as pd\n", + "values = pd.Series(['apple', 'orange', 'apple',\n", + " 'apple'] * 2)\n", + "values\n", + "pd.unique(values)\n", + "pd.value_counts(values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "values = pd.Series([0, 1, 0, 0] * 2)\n", + "dim = pd.Series(['apple', 'orange'])\n", + "values\n", + "dim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dim.take(values)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Categorical Type in pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fruits = ['apple', 'orange', 'apple', 'apple'] * 2\n", + "N = len(fruits)\n", + "df = pd.DataFrame({'fruit': fruits,\n", + " 'basket_id': np.arange(N),\n", + " 'count': np.random.randint(3, 15, size=N),\n", + " 'weight': np.random.uniform(0, 4, size=N)},\n", + " columns=['basket_id', 'fruit', 'count', 'weight'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fruit_cat = df['fruit'].astype('category')\n", + "fruit_cat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "c = fruit_cat.values\n", + "type(c)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "c.categories\n", + "c.codes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df['fruit'] = df['fruit'].astype('category')\n", + "df.fruit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])\n", + "my_categories" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "categories = ['foo', 'bar', 'baz']\n", + "codes = [0, 1, 2, 0, 0, 1]\n", + "my_cats_2 = pd.Categorical.from_codes(codes, categories)\n", + "my_cats_2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ordered_cat = pd.Categorical.from_codes(codes, categories,\n", + " ordered=True)\n", + "ordered_cat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "my_cats_2.as_ordered()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Computations with Categoricals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.random.seed(12345)\n", + "draws = np.random.randn(1000)\n", + "draws[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bins = pd.qcut(draws, 4)\n", + "bins" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\n", + "bins\n", + "bins.codes[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bins = pd.Series(bins, name='quartile')\n", + "results = (pd.Series(draws)\n", + " .groupby(bins)\n", + " .agg(['count', 'min', 'max'])\n", + " 
.reset_index())\n", + "results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "results['quartile']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Better performance with categoricals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "N = 10000000\n", + "draws = pd.Series(np.random.randn(N))\n", + "labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "categories = labels.astype('category')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "labels.memory_usage()\n", + "categories.memory_usage()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "%time _ = labels.astype('category')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Categorical Methods" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "s = pd.Series(['a', 'b', 'c', 'd'] * 2)\n", + "cat_s = s.astype('category')\n", + "cat_s" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cat_s.cat.codes\n", + 
"cat_s.cat.categories" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "actual_categories = ['a', 'b', 'c', 'd', 'e']\n", + "cat_s2 = cat_s.cat.set_categories(actual_categories)\n", + "cat_s2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cat_s.value_counts()\n", + "cat_s2.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cat_s3 = cat_s[cat_s.isin(['a', 'b'])]\n", + "cat_s3\n", + "cat_s3.cat.remove_unused_categories()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Creating dummy variables for modeling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.get_dummies(cat_s)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Advanced GroupBy Use" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Group Transforms and \"Unwrapped\" GroupBys" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,\n", + " 'value': 
np.arange(12.)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g = df.groupby('key').value\n", + "g.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g.transform(lambda x: x.mean())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g.transform('mean')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g.transform(lambda x: x * 2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g.transform(lambda x: x.rank(ascending=False))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def normalize(x):\n", + " return (x - x.mean()) / x.std()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g.transform(normalize)\n", + "g.apply(normalize)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g.transform('mean')\n", + "normalized = (df['value'] - g.transform('mean')) / g.transform('std')\n", + "normalized" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Grouped 
Time Resampling" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "N = 15\n", + "times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)\n", + "df = pd.DataFrame({'time': times,\n", + " 'value': np.arange(N)})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df.set_index('time').resample('5min').count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df2 = pd.DataFrame({'time': times.repeat(3),\n", + " 'key': np.tile(['a', 'b', 'c'], N),\n", + " 'value': np.arange(N * 3.)})\n", + "df2[:7]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "time_key = pd.TimeGrouper('5min')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "resampled = (df2.set_index('time')\n", + " .groupby(['key', time_key])\n", + " .sum())\n", + "resampled\n", + "resampled.reset_index()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Techniques for Method Chaining" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "df = load_data()\n", + "df2 = df[df['col2'] < 0]\n", + "df2['col1_demeaned'] = df2['col1'] - df2['col1'].mean()\n", + "result = df2.groupby('key').col1_demeaned.std()" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Usual 
non-functional way\n", + "df2 = df.copy()\n", + "df2['k'] = v\n", + "\n", + "# Functional assign way\n", + "df2 = df.assign(k=v)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = (df2.assign(col1_demeaned=df2.col1 - df2.col2.mean())\n", + " .groupby('key')\n", + " .col1_demeaned.std())" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "df = load_data()\n", + "df2 = df[df['col2'] < 0]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "df = (load_data()\n", + " [lambda x: x['col2'] < 0])" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = (load_data()\n", + " [lambda x: x.col2 < 0]\n", + " .assign(col1_demeaned=lambda x: x.col1 - x.col1.mean())\n", + " .groupby('key')\n", + " .col1_demeaned.std())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### The pipe Method" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "a = f(df, arg1=v1)\n", + "b = g(a, v2, arg3=v3)\n", + "c = h(b, arg4=v4)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = (df.pipe(f, arg1=v1)\n", + " .pipe(g, v2, arg3=v3)\n", + " .pipe(h, arg4=v4))" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "g = df.groupby(['key1', 'key2'])\n", + "df['col1'] = df['col1'] - g.transform('mean')" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "def group_demean(df, by, cols):\n", + " result = df.copy()\n", + " g = df.groupby(by)\n", + " for c in cols:\n", + " result[c] = df[c] - g[c].transform('mean')\n", + " return result" 
+ ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "result = (df[df.col1 < 0]\n", + " .pipe(group_demean, ['key1', 'key2'], ['col1']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch12" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Advanced pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Categorical Data" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Background and Motivation" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import numpy as np; import pandas as pd", - "values = pd.Series(['apple', 'orange', 'apple',", - " 'apple'] * 2)", - "values", - "pd.unique(values)", - "pd.value_counts(values)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "values = pd.Series([0, 1, 0, 0] * 2)", - "dim = pd.Series(['apple', 'orange'])", - "values", - "dim" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dim.take(values)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Categorical Type in pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fruits = ['apple', 'orange', 'apple', 'apple'] * 2", - "N = len(fruits)", - "df = pd.DataFrame({'fruit': fruits,", - " 'basket_id': np.arange(N),", - " 'count': np.random.randint(3, 15, size=N),", - " 'weight': np.random.uniform(0, 4, size=N)},", - " columns=['basket_id', 'fruit', 'count', 'weight'])", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fruit_cat = df['fruit'].astype('category')", - "fruit_cat" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "c = fruit_cat.values", - "type(c)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "c.categories", - "c.codes" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df['fruit'] = df['fruit'].astype('category')", - "df.fruit" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_categories = pd.Categorical(['foo', 'bar', 'baz', 'foo', 'bar'])", - "my_categories" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "categories = ['foo', 'bar', 'baz']", - "codes = [0, 1, 2, 0, 0, 1]", - "my_cats_2 = pd.Categorical.from_codes(codes, categories)", - "my_cats_2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "ordered_cat = pd.Categorical.from_codes(codes, categories,", - " ordered=True)", - "ordered_cat" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "my_cats_2.as_ordered()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Computations with Categoricals" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.random.seed(12345)", - "draws = np.random.randn(1000)", - "draws[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = pd.qcut(draws, 4)", - "bins" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = pd.qcut(draws, 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])", - "bins", - "bins.codes[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = pd.Series(bins, name='quartile')", - "results = (pd.Series(draws)", - " .groupby(bins)", - " .agg(['count', 'min', 'max'])", - " .reset_index())", - "results" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results['quartile']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Better performance with categoricals" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "N = 10000000", - "draws = pd.Series(np.random.randn(N))", - "labels = pd.Series(['foo', 'bar', 'baz', 'qux'] * (N // 4))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "categories = labels.astype('category')" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "labels.memory_usage()", - "categories.memory_usage()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%time _ = labels.astype('category')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Categorical Methods" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "s = pd.Series(['a', 'b', 'c', 'd'] * 2)", - "cat_s = s.astype('category')", - "cat_s" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cat_s.cat.codes", - "cat_s.cat.categories" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "actual_categories = ['a', 'b', 'c', 'd', 'e']", - "cat_s2 = cat_s.cat.set_categories(actual_categories)", - "cat_s2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cat_s.value_counts()", - "cat_s2.value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cat_s3 = cat_s[cat_s.isin(['a', 'b'])]", - "cat_s3", - "cat_s3.cat.remove_unused_categories()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Creating dummy variables for modeling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.get_dummies(cat_s)" - ], - 
"language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Advanced GroupBy Use" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Group Transforms and \"Unwrapped\" GroupBys" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = pd.DataFrame({'key': ['a', 'b', 'c'] * 4,", - " 'value': np.arange(12.)})", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g = df.groupby('key').value", - "g.mean()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g.transform(lambda x: x.mean())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g.transform('mean')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g.transform(lambda x: x * 2)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g.transform(lambda x: x.rank(ascending=False))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def normalize(x):", - " return (x - x.mean()) / x.std()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g.transform(normalize)", - "g.apply(normalize)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g.transform('mean')", - "normalized = (df['value'] - g.transform('mean')) / g.transform('std')", - "normalized" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": 
"heading", - "level": 3, - "metadata": {}, - "source": [ - "Grouped Time Resampling" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "N = 15", - "times = pd.date_range('2017-05-20 00:00', freq='1min', periods=N)", - "df = pd.DataFrame({'time': times,", - " 'value': np.arange(N)})", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df.set_index('time').resample('5min').count()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2 = pd.DataFrame({'time': times.repeat(3),", - " 'key': np.tile(['a', 'b', 'c'], N),", - " 'value': np.arange(N * 3.)})", - "df2[:7]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "time_key = pd.TimeGrouper('5min')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "resampled = (df2.set_index('time')", - " .groupby(['key', time_key])", - " .sum())", - "resampled", - "resampled.reset_index()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Techniques for Method Chaining" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "df = load_data()", - "df2 = df[df['col2'] < 0]", - "df2['col1_demeaned'] = df2['col1'] - df2['col1'].mean()", - "result = df2.groupby('key').col1_demeaned.std()" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "# Usual non-functional way", - "df2 = df.copy()", - "df2['k'] = v", - "", - "# Functional assign way", - "df2 = df.assign(k=v)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = (df2.assign(col1_demeaned=df2.col1 - df2.col2.mean())", - " .groupby('key')", - " .col1_demeaned.std())" - ] - }, - { 
- "cell_type": "raw", - "metadata": {}, - "source": [ - "df = load_data()", - "df2 = df[df['col2'] < 0]" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "df = (load_data()", - " [lambda x: x['col2'] < 0])" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = (load_data()", - " [lambda x: x.col2 < 0]", - " .assign(col1_demeaned=lambda x: x.col1 - x.col1.mean())", - " .groupby('key')", - " .col1_demeaned.std())" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "The pipe Method" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "a = f(df, arg1=v1)", - "b = g(a, v2, arg3=v3)", - "c = h(b, arg4=v4)" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = (df.pipe(f, arg1=v1)", - " .pipe(g, v2, arg3=v3)", - " .pipe(h, arg4=v4))" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "g = df.groupby(['key1', 'key2'])", - "df['col1'] = df['col1'] - g.transform('mean')" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "def group_demean(df, by, cols):", - " result = df.copy()", - " g = df.groupby(by)", - " for c in cols:", - " result[c] = df[c] - g[c].transform('mean')", - " return result" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "result = (df[df.col1 < 0]", - " .pipe(group_demean, ['key1', 'key2'], ['col1']))" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + 
"codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch13.ipynb b/ch13.ipynb index d41ad070b..c94bac0f5 100644 --- a/ch13.ipynb +++ b/ch13.ipynb @@ -1,666 +1,824 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "# Introduction to Modeling Libraries " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "np.random.seed(12345)\n", + "import matplotlib.pyplot as plt\n", + "plt.rc('figure', figsize=(10, 6))\n", + "PREVIOUS_MAX_ROWS = pd.options.display.max_rows\n", + "pd.options.display.max_rows = 20\n", + "np.set_printoptions(precision=4, suppress=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Interfacing Between pandas and Model Code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "data = pd.DataFrame({\n", + " 'x0': [1, 2, 3, 4, 5],\n", + " 'x1': [0.01, -0.01, 0.25, -4.1, 0.],\n", + " 'y': [-1.5, 0., 3.6, 1.3, -2.]})\n", + "data\n", + "data.columns\n", + "data.values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df2 = pd.DataFrame(data.values, columns=['one', 'two', 'three'])\n", + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "model_cols = ['x0', 'x1']\n", + "data.loc[:, model_cols].values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],\n", + " categories=['a', 'b'])\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dummies = pd.get_dummies(data.category, prefix='category')\n", + "data_with_dummies = data.drop('category', axis=1).join(dummies)\n", + "data_with_dummies" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Creating Model Descriptions with Patsy" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "y ~ x0 + x1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame({\n", + " 'x0': [1, 2, 3, 4, 5],\n", + " 'x1': [0.01, -0.01, 0.25, -4.1, 0.],\n", + " 'y': [-1.5, 0., 3.6, 1.3, -2.]})\n", + "data\n", + "import patsy\n", + "y, X = patsy.dmatrices('y ~ x0 + x1', data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "np.asarray(y)\n", + "np.asarray(X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, 
+ "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "patsy.dmatrices('y ~ x0 + x1 + 0', data)[1]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "coef, resid, _, _ = np.linalg.lstsq(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "coef\n", + "coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)\n", + "coef" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Data Transformations in Patsy Formulas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "new_data = pd.DataFrame({\n", + " 'x0': [6, 7, 8, 9],\n", + " 'x1': [3.1, -0.5, 0, 2.3],\n", + " 'y': [1, 2, 3, 4]})\n", + "new_X = patsy.build_design_matrices([X.design_info], new_data)\n", + "new_X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)\n", + "X" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": 
true + }, + "source": [ + "### Categorical Data and Patsy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame({\n", + " 'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],\n", + " 'key2': [0, 1, 0, 1, 0, 1, 0, 0],\n", + " 'v1': [1, 2, 3, 4, 5, 6, 7, 8],\n", + " 'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]\n", + "})\n", + "y, X = patsy.dmatrices('v2 ~ key1', data)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y, X = patsy.dmatrices('v2 ~ key1 + 0', data)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y, X = patsy.dmatrices('v2 ~ C(key2)', data)\n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})\n", + "data\n", + "y, X = patsy.dmatrices('v2 ~ key1 + key2', data)\n", + "X\n", + "y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)\n", + "X" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Introduction to statsmodels" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Estimating Linear Models" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import statsmodels.api as sm\n", + "import statsmodels.formula.api as smf" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def dnorm(mean, variance, size=1):\n", + " if isinstance(size, int):\n", + " size = size,\n", + " return mean + np.sqrt(variance) * np.random.randn(*size)\n", + "\n", + "# For reproducibility\n", + "np.random.seed(12345)\n", + "\n", + "N = 100\n", + "X = np.c_[dnorm(0, 0.4, size=N),\n", + " dnorm(0, 0.6, size=N),\n", + " dnorm(0, 0.2, size=N)]\n", + "eps = dnorm(0, 0.1, size=N)\n", + "beta = [0.1, 0.3, 0.5]\n", + "\n", + "y = np.dot(X, beta) + eps" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X[:5]\n", + "y[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "X_model = sm.add_constant(X)\n", + "X_model[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "model = sm.OLS(y, X)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "results = model.fit()\n", + "results.params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "print(results.summary())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])\n", + "data['y'] = y\n", + "data[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()\n", + "results.params\n", + "results.tvalues" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "results.predict(data[:5])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Estimating Time Series Processes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "init_x = 4\n", + "\n", + "import random\n", + "values = [init_x, init_x]\n", + "N = 1000\n", + "\n", + "b0 = 0.8\n", + "b1 = -0.4\n", + "noise = dnorm(0, 0.1, N)\n", + "for i in range(N):\n", + " new_x = values[-1] * b0 + values[-2] * b1 + noise[i]\n", + " values.append(new_x)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "MAXLAGS = 5\n", + "model = sm.tsa.AR(values)\n", + "results = model.fit(MAXLAGS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "results.params" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Introduction to scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "train = pd.read_csv('datasets/titanic/train.csv')\n", + "test = pd.read_csv('datasets/titanic/test.csv')\n", + "train[:4]" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "train.isnull().sum()\n", + "test.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "impute_value = train['Age'].median()\n", + "train['Age'] = train['Age'].fillna(impute_value)\n", + "test['Age'] = test['Age'].fillna(impute_value)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "train['IsFemale'] = (train['Sex'] == 'female').astype(int)\n", + "test['IsFemale'] = (test['Sex'] == 'female').astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "predictors = ['Pclass', 'IsFemale', 'Age']\n", + "X_train = train[predictors].values\n", + "X_test = test[predictors].values\n", + "y_train = train['Survived'].values\n", + "X_train[:5]\n", + "y_train[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegression\n", + "model = LogisticRegression()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "model.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "y_predict = model.predict(X_test)\n", + "y_predict[:10]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + 
}, + "source": [ + "(y_true == y_predict).mean()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.linear_model import LogisticRegressionCV\n", + "model_cv = LogisticRegressionCV(10)\n", + "model_cv.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from sklearn.model_selection import cross_val_score\n", + "model = LogisticRegression(C=10)\n", + "scores = cross_val_score(model, X_train, y_train, cv=4)\n", + "scores" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Continuing Your Education" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" + ] + } + ], "metadata": { - "name": "generated_ch13" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Introduction to Modeling Libraries " - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "import numpy as np", - "import pandas as pd", - "np.random.seed(12345)", - "import matplotlib.pyplot as plt", - "plt.rc('figure', figsize=(10, 6))", - "PREVIOUS_MAX_ROWS = pd.options.display.max_rows", - "pd.options.display.max_rows = 20", - "np.set_printoptions(precision=4, suppress=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Interfacing Between pandas and Model Code" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"import pandas as pd", - "import numpy as np", - "data = pd.DataFrame({", - " 'x0': [1, 2, 3, 4, 5],", - " 'x1': [0.01, -0.01, 0.25, -4.1, 0.],", - " 'y': [-1.5, 0., 3.6, 1.3, -2.]})", - "data", - "data.columns", - "data.values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df2 = pd.DataFrame(data.values, columns=['one', 'two', 'three'])", - "df2" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "model_cols = ['x0', 'x1']", - "data.loc[:, model_cols].values" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['category'] = pd.Categorical(['a', 'b', 'a', 'a', 'b'],", - " categories=['a', 'b'])", - "data" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dummies = pd.get_dummies(data.category, prefix='category')", - "data_with_dummies = data.drop('category', axis=1).join(dummies)", - "data_with_dummies" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Creating Model Descriptions with Patsy" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "y ~ x0 + x1" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame({", - " 'x0': [1, 2, 3, 4, 5],", - " 'x1': [0.01, -0.01, 0.25, -4.1, 0.],", - " 'y': [-1.5, 0., 3.6, 1.3, -2.]})", - "data", - "import patsy", - "y, X = patsy.dmatrices('y ~ x0 + x1', data)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "np.asarray(y)", 
- "np.asarray(X)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "patsy.dmatrices('y ~ x0 + x1 + 0', data)[1]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "coef, resid, _, _ = np.linalg.lstsq(X, y)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "coef", - "coef = pd.Series(coef.squeeze(), index=X.design_info.column_names)", - "coef" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Data Transformations in Patsy Formulas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y, X = patsy.dmatrices('y ~ x0 + np.log(np.abs(x1) + 1)', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y, X = patsy.dmatrices('y ~ standardize(x0) + center(x1)', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "new_data = pd.DataFrame({", - " 'x0': [6, 7, 8, 9],", - " 'x1': [3.1, -0.5, 0, 2.3],", - " 'y': [1, 2, 3, 4]})", - "new_X = patsy.build_design_matrices([X.design_info], new_data)", - "new_X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y, X = patsy.dmatrices('y ~ I(x0 + x1)', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Categorical Data and Patsy" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame({", - " 'key1': ['a', 'a', 'b', 'b', 'a', 'b', 'a', 'b'],", - " 'key2': [0, 1, 0, 1, 
0, 1, 0, 0],", - " 'v1': [1, 2, 3, 4, 5, 6, 7, 8],", - " 'v2': [-1, 0, 2.5, -0.5, 4.0, -1.2, 0.2, -1.7]", - "})", - "y, X = patsy.dmatrices('v2 ~ key1', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y, X = patsy.dmatrices('v2 ~ key1 + 0', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y, X = patsy.dmatrices('v2 ~ C(key2)', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data['key2'] = data['key2'].map({0: 'zero', 1: 'one'})", - "data", - "y, X = patsy.dmatrices('v2 ~ key1 + key2', data)", - "X", - "y, X = patsy.dmatrices('v2 ~ key1 + key2 + key1:key2', data)", - "X" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Introduction to statsmodels" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Estimating Linear Models" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import statsmodels.api as sm", - "import statsmodels.formula.api as smf" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def dnorm(mean, variance, size=1):", - " if isinstance(size, int):", - " size = size,", - " return mean + np.sqrt(variance) * np.random.randn(*size)", - "", - "# For reproducibility", - "np.random.seed(12345)", - "", - "N = 100", - "X = np.c_[dnorm(0, 0.4, size=N),", - " dnorm(0, 0.6, size=N),", - " dnorm(0, 0.2, size=N)]", - "eps = dnorm(0, 0.1, size=N)", - "beta = [0.1, 0.3, 0.5]", - "", - "y = np.dot(X, beta) + eps" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"X[:5]", - "y[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "X_model = sm.add_constant(X)", - "X_model[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "model = sm.OLS(y, X)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results = model.fit()", - "results.params" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print(results.summary())" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.DataFrame(X, columns=['col0', 'col1', 'col2'])", - "data['y'] = y", - "data[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results = smf.ols('y ~ col0 + col1 + col2', data=data).fit()", - "results.params", - "results.tvalues" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results.predict(data[:5])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Estimating Time Series Processes" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "init_x = 4", - "", - "import random", - "values = [init_x, init_x]", - "N = 1000", - "", - "b0 = 0.8", - "b1 = -0.4", - "noise = dnorm(0, 0.1, N)", - "for i in range(N):", - " new_x = values[-1] * b0 + values[-2] * b1 + noise[i]", - " values.append(new_x)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "MAXLAGS = 5", - "model = sm.tsa.AR(values)", 
- "results = model.fit(MAXLAGS)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results.params" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Introduction to scikit-learn" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "train = pd.read_csv('datasets/titanic/train.csv')", - "test = pd.read_csv('datasets/titanic/test.csv')", - "train[:4]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "train.isnull().sum()", - "test.isnull().sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "impute_value = train['Age'].median()", - "train['Age'] = train['Age'].fillna(impute_value)", - "test['Age'] = test['Age'].fillna(impute_value)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "train['IsFemale'] = (train['Sex'] == 'female').astype(int)", - "test['IsFemale'] = (test['Sex'] == 'female').astype(int)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "predictors = ['Pclass', 'IsFemale', 'Age']", - "X_train = train[predictors].values", - "X_test = test[predictors].values", - "y_train = train['Survived'].values", - "X_train[:5]", - "y_train[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from sklearn.linear_model import LogisticRegression", - "model = LogisticRegression()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "model.fit(X_train, y_train)" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "y_predict = model.predict(X_test)", - "y_predict[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "(y_true == y_predict).mean()" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from sklearn.linear_model import LogisticRegressionCV", - "model_cv = LogisticRegressionCV(10)", - "model_cv.fit(X_train, y_train)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from sklearn.model_selection import cross_val_score", - "model = LogisticRegression(C=10)", - "scores = cross_val_score(model, X_train, y_train, cv=4)", - "scores" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Continuing Your Education" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.options.display.max_rows = PREVIOUS_MAX_ROWS" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/ch14.ipynb b/ch14.ipynb index 18c38ec93..7379b7409 100644 --- a/ch14.ipynb +++ b/ch14.ipynb @@ -1,1659 +1,2050 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + 
"editable": true + }, + "source": [ + "# Data Analysis Examples" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## 1.USA.gov Data from Bitly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from numpy.random import randn\n", + "import numpy as np\n", + "np.random.seed(123)\n", + "import os\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "plt.rc('figure', figsize=(10, 6))\n", + "np.set_printoptions(precision=4)\n", + "pd.options.display.max_rows = 20" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [5]: path = 'datasets/bitly_usagov/example.txt'\n", + "\n", + "In [6]: open(path).readline()\n", + "Out[6]: '{ \"a\": \"Mozilla\\\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\\\/535.11\n", + "(KHTML, like Gecko) Chrome\\\\/17.0.963.78 Safari\\\\/535.11\", \"c\": \"US\", \"nk\": 1,\n", + "\"tz\": \"America\\\\/New_York\", \"gr\": \"MA\", \"g\": \"A6qOVH\", \"h\": \"wfLQtf\", \"l\":\n", + "\"orofrog\", \"al\": \"en-US,en;q=0.8\", \"hh\": \"1.usa.gov\", \"r\":\n", + "\"http:\\\\/\\\\/www.facebook.com\\\\/l\\\\/7AQEFzjSi\\\\/1.usa.gov\\\\/wfLQtf\", \"u\":\n", + "\"http:\\\\/\\\\/www.ncbi.nlm.nih.gov\\\\/pubmed\\\\/22415991\", \"t\": 1331923247, \"hc\":\n", + "1331822918, \"cy\": \"Danvers\", \"ll\": [ 42.576698, -70.954903 ] }\\n'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import json\n", + "path = 'datasets/bitly_usagov/example.txt'\n", + "records = [json.loads(line) for line in open(path)]" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [18]: records[0]\n", + "Out[18]:\n", + "{'a': 
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko)\n", + "Chrome/17.0.963.78 Safari/535.11',\n", + " 'al': 'en-US,en;q=0.8',\n", + " 'c': 'US',\n", + " 'cy': 'Danvers',\n", + " 'g': 'A6qOVH',\n", + " 'gr': 'MA',\n", + " 'h': 'wfLQtf',\n", + " 'hc': 1331822918,\n", + " 'hh': '1.usa.gov',\n", + " 'l': 'orofrog',\n", + " 'll': [42.576698, -70.954903],\n", + " 'nk': 1,\n", + " 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',\n", + " 't': 1331923247,\n", + " 'tz': 'America/New_York',\n", + " 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Counting Time Zones in Pure Python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "time_zones = [rec['tz'] for rec in records]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "time_zones = [rec['tz'] for rec in records if 'tz' in rec]\n", + "time_zones[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def get_counts(sequence):\n", + " counts = {}\n", + " for x in sequence:\n", + " if x in counts:\n", + " counts[x] += 1\n", + " else:\n", + " counts[x] = 1\n", + " return counts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "def get_counts2(sequence):\n", + " counts = defaultdict(int) # values will initialize to 0\n", + " for x in sequence:\n", + " counts[x] += 1\n", + " return counts" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "counts = get_counts(time_zones)\n", + "counts['America/New_York']\n", + "len(time_zones)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def top_counts(count_dict, n=10):\n", + " value_key_pairs = [(count, tz) for tz, count in count_dict.items()]\n", + " value_key_pairs.sort()\n", + " return value_key_pairs[-n:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "top_counts(counts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "from collections import Counter\n", + "counts = Counter(time_zones)\n", + "counts.most_common(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Counting Time Zones with pandas" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "frame = pd.DataFrame(records)\n", + "frame.info()\n", + "frame['tz'][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "tz_counts = frame['tz'].value_counts()\n", + "tz_counts[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "clean_tz = frame['tz'].fillna('Missing')\n", + 
"clean_tz[clean_tz == ''] = 'Unknown'\n", + "tz_counts = clean_tz.value_counts()\n", + "tz_counts[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure(figsize=(10, 4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import seaborn as sns\n", + "subset = tz_counts[:10]\n", + "sns.barplot(y=subset.index, x=subset.values)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "frame['a'][1]\n", + "frame['a'][50]\n", + "frame['a'][51][:50] # long line" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "results = pd.Series([x.split()[0] for x in frame.a.dropna()])\n", + "results[:5]\n", + "results.value_counts()[:8]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cframe = frame[frame.a.notnull()]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cframe = cframe.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "cframe['os'] = np.where(cframe['a'].str.contains('Windows'),\n", + " 'Windows', 'Not Windows')\n", + "cframe['os'][:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + 
"editable": true + }, + "outputs": [], + "source": [ + "by_tz_os = cframe.groupby(['tz', 'os'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "agg_counts = by_tz_os.size().unstack().fillna(0)\n", + "agg_counts[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Use to sort in ascending order\n", + "indexer = agg_counts.sum(1).argsort()\n", + "indexer[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "count_subset = agg_counts.take(indexer[-10:])\n", + "count_subset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "agg_counts.sum(1).nlargest(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Rearrange the data for plotting\n", + "count_subset = count_subset.stack()\n", + "count_subset.name = 'total'\n", + "count_subset = count_subset.reset_index()\n", + "count_subset[:10]\n", + "sns.barplot(x='total', y='tz', hue='os', data=count_subset)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def norm_total(group):\n", + " group['normed_total'] = group.total / group.total.sum()\n", + " return 
group\n", + "\n", + "results = count_subset.groupby('tz').apply(norm_total)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sns.barplot(x='normed_total', y='tz', hue='os', data=results)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "g = count_subset.groupby('tz')\n", + "results2 = count_subset.total / g.total.transform('sum')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## MovieLens 1M Dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# Make display smaller\n", + "pd.options.display.max_rows = 10\n", + "\n", + "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']\n", + "users = pd.read_table('datasets/movielens/users.dat', sep='::',\n", + " header=None, names=unames)\n", + "\n", + "rnames = ['user_id', 'movie_id', 'rating', 'timestamp']\n", + "ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',\n", + " header=None, names=rnames)\n", + "\n", + "mnames = ['movie_id', 'title', 'genres']\n", + "movies = pd.read_table('datasets/movielens/movies.dat', sep='::',\n", + " header=None, names=mnames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "users[:5]\n", + "ratings[:5]\n", + "movies[:5]\n", + "ratings" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "data = pd.merge(pd.merge(ratings, users), movies)\n", + "data\n", + "data.iloc[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mean_ratings = data.pivot_table('rating', index='title',\n", + " columns='gender', aggfunc='mean')\n", + "mean_ratings[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ratings_by_title = data.groupby('title').size()\n", + "ratings_by_title[:10]\n", + "active_titles = ratings_by_title.index[ratings_by_title >= 250]\n", + "active_titles" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Select rows on the index\n", + "mean_ratings = mean_ratings.loc[active_titles]\n", + "mean_ratings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':\n", + " 'Seven Samurai (Shichinin no samurai) (1954)'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)\n", + "top_female_ratings[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Measuring Rating Disagreement" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "sorted_by_diff = mean_ratings.sort_values(by='diff')\n", + "sorted_by_diff[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Reverse order of rows, take first 10 rows\n", + "sorted_by_diff[::-1][:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Standard deviation of rating grouped by title\n", + "rating_std_by_title = data.groupby('title')['rating'].std()\n", + "# Filter down to active_titles\n", + "rating_std_by_title = rating_std_by_title.loc[active_titles]\n", + "# Order Series by value in descending order\n", + "rating_std_by_title.sort_values(ascending=False)[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## US Baby Names 1880–2010" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "In [4]: names.head(10)\n", + "Out[4]:\n", + " name sex births year\n", + "0 Mary F 7065 1880\n", + "1 Anna F 2604 1880\n", + "2 Emma F 2003 1880\n", + "3 Elizabeth F 1939 1880\n", + "4 Minnie F 1746 1880\n", + "5 Margaret F 1578 1880\n", + "6 Ida F 1472 1880\n", + "7 Alice F 1414 1880\n", + "8 Bertha F 1320 1880\n", + "9 Sarah F 1288 1880" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + 
"outputs": [], + "source": [ + "!head -n 10 datasets/babynames/yob1880.txt" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "names1880 = pd.read_csv('datasets/babynames/yob1880.txt',\n", + " names=['name', 'sex', 'births'])\n", + "names1880" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names1880.groupby('sex').births.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "years = range(1880, 2011)\n", + "\n", + "pieces = []\n", + "columns = ['name', 'sex', 'births']\n", + "\n", + "for year in years:\n", + " path = 'datasets/babynames/yob%d.txt' % year\n", + " frame = pd.read_csv(path, names=columns)\n", + "\n", + " frame['year'] = year\n", + " pieces.append(frame)\n", + "\n", + "# Concatenate everything into a single DataFrame\n", + "names = pd.concat(pieces, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "total_births = names.pivot_table('births', index='year',\n", + " columns='sex', aggfunc=sum)\n", + "total_births.tail()\n", + "total_births.plot(title='Total births by sex and year')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def add_prop(group):\n", + " group['prop'] = 
group.births / group.births.sum()\n", + " return group\n", + "names = names.groupby(['year', 'sex']).apply(add_prop)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "names.groupby(['year', 'sex']).prop.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def get_top1000(group):\n", + " return group.sort_values(by='births', ascending=False)[:1000]\n", + "grouped = names.groupby(['year', 'sex'])\n", + "top1000 = grouped.apply(get_top1000)\n", + "# Drop the group index, not needed\n", + "top1000.reset_index(inplace=True, drop=True)" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "pieces = []\n", + "for year, group in names.groupby(['year', 'sex']):\n", + " pieces.append(group.sort_values(by='births', ascending=False)[:1000])\n", + "top1000 = pd.concat(pieces, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "top1000" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Analyzing Naming Trends" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "boys = top1000[top1000.sex == 'M']\n", + "girls = top1000[top1000.sex == 'F']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": 
false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "total_births = top1000.pivot_table('births', index='year',\n", + " columns='name',\n", + " aggfunc=sum)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "total_births.info()\n", + "subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]\n", + "subset.plot(subplots=True, figsize=(12, 10), grid=False,\n", + " title=\"Number of births per year\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Measuring the increase in naming diversity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "table = top1000.pivot_table('prop', index='year',\n", + " columns='sex', aggfunc=sum)\n", + "table.plot(title='Sum of table1000.prop by year and sex',\n", + " yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "df = boys[boys.year == 2010]\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()\n", + "prop_cumsum[:10]\n", + "prop_cumsum.values.searchsorted(0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + 
"editable": true + }, + "outputs": [], + "source": [ + "df = boys[boys.year == 1900]\n", + "in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()\n", + "in1900.values.searchsorted(0.5) + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def get_quantile_count(group, q=0.5):\n", + " group = group.sort_values(by='prop', ascending=False)\n", + " return group.prop.cumsum().values.searchsorted(q) + 1\n", + "\n", + "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)\n", + "diversity = diversity.unstack('sex')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "diversity.head()\n", + "diversity.plot(title=\"Number of popular names in top 50%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### The “last letter” revolution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# extract last letter from name column\n", + "get_last_letter = lambda x: x[-1]\n", + "last_letters = names.name.map(get_last_letter)\n", + "last_letters.name = 'last_letter'\n", + "\n", + "table = names.pivot_table('births', index=last_letters,\n", + " columns=['sex', 'year'], aggfunc=sum)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "subtable = table.reindex(columns=[1910, 1960, 2010], 
level='year')\n", + "subtable.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "subtable.sum()\n", + "letter_prop = subtable / subtable.sum()\n", + "letter_prop" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig, axes = plt.subplots(2, 1, figsize=(10, 8))\n", + "letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')\n", + "letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',\n", + " legend=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.subplots_adjust(hspace=0.25)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "letter_prop = table / table.sum()\n", + "dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T\n", + "dny_ts.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.close('all')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "dny_ts.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "#### Boy names that became girl 
names (and vice versa)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "all_names = pd.Series(top1000.name.unique())\n", + "lesley_like = all_names[all_names.str.lower().str.contains('lesl')]\n", + "lesley_like" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "filtered = top1000[top1000.name.isin(lesley_like)]\n", + "filtered.groupby('name').births.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "table = filtered.pivot_table('births', index='year',\n", + " columns='sex', aggfunc='sum')\n", + "table = table.div(table.sum(1), axis=0)\n", + "table.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "table.plot(style={'M': 'k-', 'F': 'k--'})" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## USDA Food Database" + ] + }, + { + "cell_type": "raw", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "{\n", + " \"id\": 21441,\n", + " \"description\": \"KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,\n", + "Wing, meat and skin with breading\",\n", + " \"tags\": [\"KFC\"],\n", + " \"manufacturer\": \"Kentucky Fried Chicken\",\n", + " \"group\": \"Fast Foods\",\n", + " \"portions\": [\n", + " {\n", + " \"amount\": 1,\n", + " \"unit\": \"wing, with 
skin\",\n", + " \"grams\": 68.0\n", + " },\n", + "\n", + " ...\n", + " ],\n", + " \"nutrients\": [\n", + " {\n", + " \"value\": 20.8,\n", + " \"units\": \"g\",\n", + " \"description\": \"Protein\",\n", + " \"group\": \"Composition\"\n", + " },\n", + "\n", + " ...\n", + " ]\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "import json\n", + "db = json.load(open('datasets/usda_food/database.json'))\n", + "len(db)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "db[0].keys()\n", + "db[0]['nutrients'][0]\n", + "nutrients = pd.DataFrame(db[0]['nutrients'])\n", + "nutrients[:7]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "info_keys = ['description', 'group', 'id', 'manufacturer']\n", + "info = pd.DataFrame(db, columns=info_keys)\n", + "info[:5]\n", + "info.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "pd.value_counts(info.group)[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nutrients = []\n", + "\n", + "for rec in db:\n", + " fnuts = pd.DataFrame(rec['nutrients'])\n", + " fnuts['id'] = rec['id']\n", + " nutrients.append(fnuts)\n", + "\n", + "nutrients = pd.concat(nutrients, ignore_index=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nutrients" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "nutrients.duplicated().sum() # number of duplicates\n", + "nutrients = nutrients.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "col_mapping = {'description' : 'food',\n", + " 'group' : 'fgroup'}\n", + "info = info.rename(columns=col_mapping, copy=False)\n", + "info.info()\n", + "col_mapping = {'description' : 'nutrient',\n", + " 'group' : 'nutgroup'}\n", + "nutrients = nutrients.rename(columns=col_mapping, copy=False)\n", + "nutrients" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "ndata = pd.merge(nutrients, info, on='id', how='outer')\n", + "ndata.info()\n", + "ndata.iloc[30000]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fig = plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)\n", + "result['Zinc, Zn'].sort_values().plot(kind='barh')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])\n", + "\n", + "get_maximum = lambda x: x.loc[x.value.idxmax()]\n", + "get_minimum = lambda x: x.loc[x.value.idxmin()]\n", + "\n", + "max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]\n", + "\n", + "# 
make the food a little smaller\n", + "max_foods.food = max_foods.food.str[:50]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "max_foods.loc['Amino Acids']['food']" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## 2012 Federal Election Commission Database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fec = pd.read_csv('datasets/fec/P00000001-ALL.csv')\n", + "fec.info()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fec.iloc[123456]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "unique_cands = fec.cand_nm.unique()\n", + "unique_cands\n", + "unique_cands[2]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "parties = {'Bachmann, Michelle': 'Republican',\n", + " 'Cain, Herman': 'Republican',\n", + " 'Gingrich, Newt': 'Republican',\n", + " 'Huntsman, Jon': 'Republican',\n", + " 'Johnson, Gary Earl': 'Republican',\n", + " 'McCotter, Thaddeus G': 'Republican',\n", + " 'Obama, Barack': 'Democrat',\n", + " 'Paul, Ron': 'Republican',\n", + " 'Pawlenty, Timothy': 'Republican',\n", + " 'Perry, Rick': 'Republican',\n", + " \"Roemer, Charles E. 
'Buddy' III\": 'Republican',\n", + " 'Romney, Mitt': 'Republican',\n", + " 'Santorum, Rick': 'Republican'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fec.cand_nm[123456:123461]\n", + "fec.cand_nm[123456:123461].map(parties)\n", + "# Add it as a column\n", + "fec['party'] = fec.cand_nm.map(parties)\n", + "fec['party'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "(fec.contb_receipt_amt > 0).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fec = fec[fec.contb_receipt_amt > 0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Donation Statistics by Occupation and Employer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "fec.contbr_occupation.value_counts()[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "occ_mapping = {\n", + " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", + " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", + " 'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',\n", + " 'C.E.O.': 'CEO'\n", + "}\n", + "\n", + "# If no mapping 
provided, return x\n", + "f = lambda x: occ_mapping.get(x, x)\n", + "fec.contbr_occupation = fec.contbr_occupation.map(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "emp_mapping = {\n", + " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',\n", + " 'INFORMATION REQUESTED' : 'NOT PROVIDED',\n", + " 'SELF' : 'SELF-EMPLOYED',\n", + " 'SELF EMPLOYED' : 'SELF-EMPLOYED',\n", + "}\n", + "\n", + "# If no mapping provided, return x\n", + "f = lambda x: emp_mapping.get(x, x)\n", + "fec.contbr_employer = fec.contbr_employer.map(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "by_occupation = fec.pivot_table('contb_receipt_amt',\n", + " index='contbr_occupation',\n", + " columns='party', aggfunc='sum')\n", + "over_2mm = by_occupation[by_occupation.sum(1) > 2000000]\n", + "over_2mm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "over_2mm.plot(kind='barh')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "def get_top_amounts(group, key, n=5):\n", + " totals = group.groupby(key)['contb_receipt_amt'].sum()\n", + " return totals.nlargest(n)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = 
fec_mrbo.groupby('cand_nm')\n", + "grouped.apply(get_top_amounts, 'contbr_occupation', n=7)\n", + "grouped.apply(get_top_amounts, 'contbr_employer', n=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Bucketing Donation Amounts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bins = np.array([0, 1, 10, 100, 1000, 10000,\n", + " 100000, 1000000, 10000000])\n", + "labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)\n", + "labels" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = fec_mrbo.groupby(['cand_nm', labels])\n", + "grouped.size().unstack(0)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "plt.figure()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)\n", + "normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)\n", + "normed_sums\n", + "normed_sums[:-2].plot(kind='barh')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "### Donation Statistics by State" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])\n", + "totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)\n", + "totals = totals[totals.sum(1) > 100000]\n", + "totals[:10]" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "percent = totals.div(totals.sum(1), axis=0)\n", + "percent[:10]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": true, + "editable": true + }, + "source": [ + "## Conclusion" + ] + } + ], "metadata": { - "name": "generated_ch14" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Data Analysis Examples" - ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "1.USA.gov Data from Bitly" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pushd book-materials", - "from numpy.random import randn", - "import numpy as np", - "np.random.seed(123)", - "import os", - "import matplotlib.pyplot as plt", - "import pandas as pd", - "plt.rc('figure', figsize=(10, 6))", - "np.set_printoptions(precision=4)", - "pd.options.display.max_rows = 20" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [5]: path = 'datasets/bitly_usagov/example.txt'", - "", - "In [6]: open(path).readline()", - "Out[6]: '{ \"a\": \"Mozilla\\\\/5.0 (Windows NT 6.1; WOW64) AppleWebKit\\\\/535.11", - "(KHTML, like Gecko) Chrome\\\\/17.0.963.78 Safari\\\\/535.11\", \"c\": \"US\", \"nk\": 1,", - "\"tz\": \"America\\\\/New_York\", \"gr\": \"MA\", \"g\": \"A6qOVH\", \"h\": \"wfLQtf\", \"l\":", - "\"orofrog\", \"al\": \"en-US,en;q=0.8\", \"hh\": \"1.usa.gov\", \"r\":", - "\"http:\\\\/\\\\/www.facebook.com\\\\/l\\\\/7AQEFzjSi\\\\/1.usa.gov\\\\/wfLQtf\", \"u\":", - "\"http:\\\\/\\\\/www.ncbi.nlm.nih.gov\\\\/pubmed\\\\/22415991\", \"t\": 1331923247, \"hc\":", - "1331822918, \"cy\": \"Danvers\", \"ll\": [ 42.576698, -70.954903 ] }\\n'" - ] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "import json", - "path = 'datasets/bitly_usagov/example.txt'", - "records = [json.loads(line) for line in open(path)]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [18]: records[0]", - "Out[18]:", - "{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko)", - "Chrome/17.0.963.78 Safari/535.11',", - " 'al': 'en-US,en;q=0.8',", - " 'c': 'US',", - " 'cy': 'Danvers',", - " 'g': 'A6qOVH',", - " 'gr': 'MA',", - " 'h': 'wfLQtf',", - " 'hc': 1331822918,", - " 'hh': '1.usa.gov',", - " 'l': 'orofrog',", - " 'll': [42.576698, -70.954903],", - " 'nk': 1,", - " 'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',", - " 't': 1331923247,", - " 'tz': 'America/New_York',", - " 'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}" - ] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Counting Time Zones in Pure Python" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "time_zones = [rec['tz'] for rec in records]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "time_zones = [rec['tz'] for rec in records if 'tz' in rec]", - "time_zones[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_counts(sequence):", - " counts = {}", - " for x in sequence:", - " if x in counts:", - " counts[x] += 1", - " else:", - " counts[x] = 1", - " return counts" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from collections import defaultdict", - "", - "def get_counts2(sequence):", - " counts = defaultdict(int) # values will initialize to 0", - " for x in sequence:", - " counts[x] += 1", - " return counts" - ], - "language": "python", - "metadata": {}, - 
"outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "counts = get_counts(time_zones)", - "counts['America/New_York']", - "len(time_zones)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def top_counts(count_dict, n=10):", - " value_key_pairs = [(count, tz) for tz, count in count_dict.items()]", - " value_key_pairs.sort()", - " return value_key_pairs[-n:]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "top_counts(counts)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from collections import Counter", - "counts = Counter(time_zones)", - "counts.most_common(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Counting Time Zones with pandas" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas as pd", - "frame = pd.DataFrame(records)", - "frame.info()", - "frame['tz'][:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "tz_counts = frame['tz'].value_counts()", - "tz_counts[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "clean_tz = frame['tz'].fillna('Missing')", - "clean_tz[clean_tz == ''] = 'Unknown'", - "tz_counts = clean_tz.value_counts()", - "tz_counts[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure(figsize=(10, 4))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import seaborn as sns", - 
"subset = tz_counts[:10]", - "sns.barplot(y=subset.index, x=subset.values)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "frame['a'][1]", - "frame['a'][50]", - "frame['a'][51][:50] # long line" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "results = pd.Series([x.split()[0] for x in frame.a.dropna()])", - "results[:5]", - "results.value_counts()[:8]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cframe = frame[frame.a.notnull()]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cframe = cframe.copy()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "cframe['os'] = np.where(cframe['a'].str.contains('Windows'),", - " 'Windows', 'Not Windows')", - "cframe['os'][:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_tz_os = cframe.groupby(['tz', 'os'])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "agg_counts = by_tz_os.size().unstack().fillna(0)", - "agg_counts[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Use to sort in ascending order", - "indexer = agg_counts.sum(1).argsort()", - "indexer[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "count_subset = agg_counts.take(indexer[-10:])", - "count_subset" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", 
- "collapsed": false, - "input": [ - "agg_counts.sum(1).nlargest(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Rearrange the data for plotting", - "count_subset = count_subset.stack()", - "count_subset.name = 'total'", - "count_subset = count_subset.reset_index()", - "count_subset[:10]", - "sns.barplot(x='total', y='tz', hue='os', data=count_subset)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def norm_total(group):", - " group['normed_total'] = group.total / group.total.sum()", - " return group", - "", - "results = count_subset.groupby('tz').apply(norm_total)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sns.barplot(x='normed_total', y='tz', hue='os', data=results)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "g = count_subset.groupby('tz')", - "results2 = count_subset.total / g.total.transform('sum')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "MovieLens 1M Dataset" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import pandas as pd", - "", - "# Make display smaller", - "pd.options.display.max_rows = 10", - "", - "unames = ['user_id', 'gender', 'age', 'occupation', 'zip']", - "users = pd.read_table('datasets/movielens/users.dat', sep='::',", - " header=None, names=unames)", - "", - "rnames = 
['user_id', 'movie_id', 'rating', 'timestamp']", - "ratings = pd.read_table('datasets/movielens/ratings.dat', sep='::',", - " header=None, names=rnames)", - "", - "mnames = ['movie_id', 'title', 'genres']", - "movies = pd.read_table('datasets/movielens/movies.dat', sep='::',", - " header=None, names=mnames)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "users[:5]", - "ratings[:5]", - "movies[:5]", - "ratings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "data = pd.merge(pd.merge(ratings, users), movies)", - "data", - "data.iloc[0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings = data.pivot_table('rating', index='title',", - " columns='gender', aggfunc='mean')", - "mean_ratings[:5]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ratings_by_title = data.groupby('title').size()", - "ratings_by_title[:10]", - "active_titles = ratings_by_title.index[ratings_by_title >= 250]", - "active_titles" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Select rows on the index", - "mean_ratings = mean_ratings.loc[active_titles]", - "mean_ratings" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings = mean_ratings.rename(index={'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)':", - " 'Seven Samurai (Shichinin no samurai) (1954)'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "top_female_ratings = mean_ratings.sort_values(by='F', ascending=False)", 
- "top_female_ratings[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Measuring Rating Disagreement" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "mean_ratings['diff'] = mean_ratings['M'] - mean_ratings['F']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "sorted_by_diff = mean_ratings.sort_values(by='diff')", - "sorted_by_diff[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Reverse order of rows, take first 10 rows", - "sorted_by_diff[::-1][:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# Standard deviation of rating grouped by title", - "rating_std_by_title = data.groupby('title')['rating'].std()", - "# Filter down to active_titles", - "rating_std_by_title = rating_std_by_title.loc[active_titles]", - "# Order Series by value in descending order", - "rating_std_by_title.sort_values(ascending=False)[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "US Baby Names 1880\u20132010" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "In [4]: names.head(10)", - "Out[4]:", - " name sex births year", - "0 Mary F 7065 1880", - "1 Anna F 2604 1880", - "2 Emma F 2003 1880", - "3 Elizabeth F 1939 1880", - "4 Minnie F 1746 1880", - "5 Margaret F 1578 1880", - "6 Ida F 1472 1880", - "7 Alice F 1414 1880", - "8 Bertha F 1320 1880", - "9 Sarah F 1288 1880" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "!head -n 10 datasets/babynames/yob1880.txt" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "import pandas as pd", - "names1880 = pd.read_csv('datasets/babynames/yob1880.txt',", - " names=['name', 'sex', 'births'])", - "names1880" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names1880.groupby('sex').births.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "years = range(1880, 2011)", - "", - "pieces = []", - "columns = ['name', 'sex', 'births']", - "", - "for year in years:", - " path = 'datasets/babynames/yob%d.txt' % year", - " frame = pd.read_csv(path, names=columns)", - "", - " frame['year'] = year", - " pieces.append(frame)", - "", - "# Concatenate everything into a single DataFrame", - "names = pd.concat(pieces, ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births = names.pivot_table('births', index='year',", - " columns='sex', aggfunc=sum)", - "total_births.tail()", - "total_births.plot(title='Total births by sex and year')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def add_prop(group):", - " group['prop'] = group.births / group.births.sum()", - " return group", - "names = names.groupby(['year', 'sex']).apply(add_prop)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "names.groupby(['year', 'sex']).prop.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - 
"cell_type": "code", - "collapsed": false, - "input": [ - "def get_top1000(group):", - " return group.sort_values(by='births', ascending=False)[:1000]", - "grouped = names.groupby(['year', 'sex'])", - "top1000 = grouped.apply(get_top1000)", - "# Drop the group index, not needed", - "top1000.reset_index(inplace=True, drop=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "pieces = []", - "for year, group in names.groupby(['year', 'sex']):", - " pieces.append(group.sort_values(by='births', ascending=False)[:1000])", - "top1000 = pd.concat(pieces, ignore_index=True)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "top1000" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Analyzing Naming Trends" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "boys = top1000[top1000.sex == 'M']", - "girls = top1000[top1000.sex == 'F']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births = top1000.pivot_table('births', index='year',", - " columns='name',", - " aggfunc=sum)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "total_births.info()", - "subset = total_births[['John', 'Harry', 'Mary', 'Marilyn']]", - "subset.plot(subplots=True, figsize=(12, 10), grid=False,", - " title=\"Number of births per year\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Measuring the increase in naming diversity" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - 
"collapsed": false, - "input": [ - "table = top1000.pivot_table('prop', index='year',", - " columns='sex', aggfunc=sum)", - "table.plot(title='Sum of table1000.prop by year and sex',", - " yticks=np.linspace(0, 1.2, 13), xticks=range(1880, 2020, 10))" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = boys[boys.year == 2010]", - "df" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "prop_cumsum = df.sort_values(by='prop', ascending=False).prop.cumsum()", - "prop_cumsum[:10]", - "prop_cumsum.values.searchsorted(0.5)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "df = boys[boys.year == 1900]", - "in1900 = df.sort_values(by='prop', ascending=False).prop.cumsum()", - "in1900.values.searchsorted(0.5) + 1" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_quantile_count(group, q=0.5):", - " group = group.sort_values(by='prop', ascending=False)", - " return group.prop.cumsum().values.searchsorted(q) + 1", - "", - "diversity = top1000.groupby(['year', 'sex']).apply(get_quantile_count)", - "diversity = diversity.unstack('sex')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "diversity.head()", - "diversity.plot(title=\"Number of popular names in top 50%\")" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "The \u201clast letter\u201d revolution" - ] - }, - { - "cell_type": "code", - "collapsed": false, - 
"input": [ - "# extract last letter from name column", - "get_last_letter = lambda x: x[-1]", - "last_letters = names.name.map(get_last_letter)", - "last_letters.name = 'last_letter'", - "", - "table = names.pivot_table('births', index=last_letters,", - " columns=['sex', 'year'], aggfunc=sum)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "subtable = table.reindex(columns=[1910, 1960, 2010], level='year')", - "subtable.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "subtable.sum()", - "letter_prop = subtable / subtable.sum()", - "letter_prop" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import matplotlib.pyplot as plt", - "", - "fig, axes = plt.subplots(2, 1, figsize=(10, 8))", - "letter_prop['M'].plot(kind='bar', rot=0, ax=axes[0], title='Male')", - "letter_prop['F'].plot(kind='bar', rot=0, ax=axes[1], title='Female',", - " legend=False)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.subplots_adjust(hspace=0.25)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "letter_prop = table / table.sum()", - "dny_ts = letter_prop.loc[['d', 'n', 'y'], 'M'].T", - "dny_ts.head()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.close('all')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "dny_ts.plot()" - ], - "language": 
"python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "Boy names that became girl names (and vice versa)" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "all_names = pd.Series(top1000.name.unique())", - "lesley_like = all_names[all_names.str.lower().str.contains('lesl')]", - "lesley_like" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "filtered = top1000[top1000.name.isin(lesley_like)]", - "filtered.groupby('name').births.sum()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "table = filtered.pivot_table('births', index='year',", - " columns='sex', aggfunc='sum')", - "table = table.div(table.sum(1), axis=0)", - "table.tail()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "table.plot(style={'M': 'k-', 'F': 'k--'})" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "USDA Food Database" - ] - }, - { - "cell_type": "raw", - "metadata": {}, - "source": [ - "{", - " \"id\": 21441,", - " \"description\": \"KENTUCKY FRIED CHICKEN, Fried Chicken, EXTRA CRISPY,", - "Wing, meat and skin with breading\",", - " \"tags\": [\"KFC\"],", - " \"manufacturer\": \"Kentucky Fried Chicken\",", - " \"group\": \"Fast Foods\",", - " \"portions\": [", - " {", - " \"amount\": 1,", - " \"unit\": \"wing, with skin\",", - " \"grams\": 68.0", - " },", - "", - " ...", - " ],", - " \"nutrients\": [", - " {", - " \"value\": 20.8,", - " \"units\": \"g\",", - " \"description\": \"Protein\",", - " \"group\": 
\"Composition\"", - " },", - "", - " ...", - " ]", - "}" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "import json", - "db = json.load(open('datasets/usda_food/database.json'))", - "len(db)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "db[0].keys()", - "db[0]['nutrients'][0]", - "nutrients = pd.DataFrame(db[0]['nutrients'])", - "nutrients[:7]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "info_keys = ['description', 'group', 'id', 'manufacturer']", - "info = pd.DataFrame(db, columns=info_keys)", - "info[:5]", - "info.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "pd.value_counts(info.group)[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients = []", - "", - "for rec in db:", - " fnuts = pd.DataFrame(rec['nutrients'])", - " fnuts['id'] = rec['id']", - " nutrients.append(fnuts)", - "", - "nutrients = pd.concat(nutrients, ignore_index=True)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "nutrients.duplicated().sum() # number of duplicates", - "nutrients = nutrients.drop_duplicates()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "col_mapping = {'description' : 'food',", - " 'group' : 'fgroup'}", - "info = info.rename(columns=col_mapping, copy=False)", - "info.info()", - "col_mapping = {'description' : 'nutrient',", - " 'group' : 'nutgroup'}", - "nutrients = 
nutrients.rename(columns=col_mapping, copy=False)", - "nutrients" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ndata = pd.merge(nutrients, info, on='id', how='outer')", - "ndata.info()", - "ndata.iloc[30000]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fig = plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "result = ndata.groupby(['nutrient', 'fgroup'])['value'].quantile(0.5)", - "result['Zinc, Zn'].sort_values().plot(kind='barh')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_nutrient = ndata.groupby(['nutgroup', 'nutrient'])", - "", - "get_maximum = lambda x: x.loc[x.value.idxmax()]", - "get_minimum = lambda x: x.loc[x.value.idxmin()]", - "", - "max_foods = by_nutrient.apply(get_maximum)[['value', 'food']]", - "", - "# make the food a little smaller", - "max_foods.food = max_foods.food.str[:50]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "max_foods.loc['Amino Acids']['food']" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "2012 Federal Election Commission Database" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec = pd.read_csv('datasets/fec/P00000001-ALL.csv')", - "fec.info()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.iloc[123456]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "unique_cands = fec.cand_nm.unique()", 
- "unique_cands", - "unique_cands[2]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "parties = {'Bachmann, Michelle': 'Republican',", - " 'Cain, Herman': 'Republican',", - " 'Gingrich, Newt': 'Republican',", - " 'Huntsman, Jon': 'Republican',", - " 'Johnson, Gary Earl': 'Republican',", - " 'McCotter, Thaddeus G': 'Republican',", - " 'Obama, Barack': 'Democrat',", - " 'Paul, Ron': 'Republican',", - " 'Pawlenty, Timothy': 'Republican',", - " 'Perry, Rick': 'Republican',", - " \"Roemer, Charles E. 'Buddy' III\": 'Republican',", - " 'Romney, Mitt': 'Republican',", - " 'Santorum, Rick': 'Republican'}" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.cand_nm[123456:123461]", - "fec.cand_nm[123456:123461].map(parties)", - "# Add it as a column", - "fec['party'] = fec.cand_nm.map(parties)", - "fec['party'].value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "(fec.contb_receipt_amt > 0).value_counts()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec = fec[fec.contb_receipt_amt > 0]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec_mrbo = fec[fec.cand_nm.isin(['Obama, Barack', 'Romney, Mitt'])]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Donation Statistics by Occupation and Employer" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "fec.contbr_occupation.value_counts()[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"occ_mapping = {", - " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',", - " 'INFORMATION REQUESTED' : 'NOT PROVIDED',", - " 'INFORMATION REQUESTED (BEST EFFORTS)' : 'NOT PROVIDED',", - " 'C.E.O.': 'CEO'", - "}", - "", - "# If no mapping provided, return x", - "f = lambda x: occ_mapping.get(x, x)", - "fec.contbr_occupation = fec.contbr_occupation.map(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "emp_mapping = {", - " 'INFORMATION REQUESTED PER BEST EFFORTS' : 'NOT PROVIDED',", - " 'INFORMATION REQUESTED' : 'NOT PROVIDED',", - " 'SELF' : 'SELF-EMPLOYED',", - " 'SELF EMPLOYED' : 'SELF-EMPLOYED',", - "}", - "", - "# If no mapping provided, return x", - "f = lambda x: emp_mapping.get(x, x)", - "fec.contbr_employer = fec.contbr_employer.map(f)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "by_occupation = fec.pivot_table('contb_receipt_amt',", - " index='contbr_occupation',", - " columns='party', aggfunc='sum')", - "over_2mm = by_occupation[by_occupation.sum(1) > 2000000]", - "over_2mm" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "over_2mm.plot(kind='barh')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "def get_top_amounts(group, key, n=5):", - " totals = group.groupby(key)['contb_receipt_amt'].sum()", - " return totals.nlargest(n)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = fec_mrbo.groupby('cand_nm')", - "grouped.apply(get_top_amounts, 'contbr_occupation', n=7)", - 
"grouped.apply(get_top_amounts, 'contbr_employer', n=10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Bucketing Donation Amounts" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bins = np.array([0, 1, 10, 100, 1000, 10000,", - " 100000, 1000000, 10000000])", - "labels = pd.cut(fec_mrbo.contb_receipt_amt, bins)", - "labels" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = fec_mrbo.groupby(['cand_nm', labels])", - "grouped.size().unstack(0)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "plt.figure()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "bucket_sums = grouped.contb_receipt_amt.sum().unstack(0)", - "normed_sums = bucket_sums.div(bucket_sums.sum(axis=1), axis=0)", - "normed_sums", - "normed_sums[:-2].plot(kind='barh')" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 3, - "metadata": {}, - "source": [ - "Donation Statistics by State" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "grouped = fec_mrbo.groupby(['cand_nm', 'contbr_st'])", - "totals = grouped.contb_receipt_amt.sum().unstack(0).fillna(0)", - "totals = totals[totals.sum(1) > 100000]", - "totals[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "percent = totals.div(totals.sum(1), axis=0)", - "percent[:10]" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%popd" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - 
"level": 2, - "metadata": {}, - "source": [ - "Conclusion" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.0" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +}