diff --git a/scripts/bench_join.R b/scripts/bench_join.R deleted file mode 100644 index edba277f0295c..0000000000000 --- a/scripts/bench_join.R +++ /dev/null @@ -1,50 +0,0 @@ -library(xts) - -iterations <- 50 - -ns = c(100, 1000, 10000, 100000, 1000000) -kinds = c("outer", "left", "inner") - -result = matrix(0, nrow=3, ncol=length(ns)) -n <- 100000 -pct.overlap <- 0.2 - -k <- 1 - -for (ni in 1:length(ns)){ - n <- ns[ni] - rng1 <- 1:n - offset <- as.integer(n * pct.overlap) - rng2 <- rng1 + offset - x <- xts(matrix(rnorm(n * k), nrow=n, ncol=k), - as.POSIXct(Sys.Date()) + rng1) - y <- xts(matrix(rnorm(n * k), nrow=n, ncol=k), - as.POSIXct(Sys.Date()) + rng2) - timing <- numeric() - for (i in 1:3) { - kind = kinds[i] - for(j in 1:iterations) { - gc() # just to be sure - timing[j] <- system.time(merge(x,y,join=kind))[3] - } - #timing <- system.time(for (j in 1:iterations) merge.xts(x, y, join=kind), - # gcFirst=F) - #timing <- as.list(timing) - result[i, ni] <- mean(timing) * 1000 - #result[i, ni] = (timing$elapsed / iterations) * 1000 - } -} - -rownames(result) <- kinds -colnames(result) <- log10(ns) - -mat <- matrix(rnorm(500000), nrow=100000, ncol=5) -set.seed(12345) -indexer <- sample(1:100000) - -timing <- rep(0, 10) -for (i in 1:10) { - gc() - timing[i] = system.time(mat[indexer,])[3] -} - diff --git a/scripts/bench_join.py b/scripts/bench_join.py deleted file mode 100644 index f9d43772766d8..0000000000000 --- a/scripts/bench_join.py +++ /dev/null @@ -1,211 +0,0 @@ -from pandas.compat import range, lrange -import numpy as np -import pandas._libs.lib as lib -from pandas import * -from copy import deepcopy -import time - -n = 1000000 -K = 1 -pct_overlap = 0.2 - -a = np.arange(n, dtype=np.int64) -b = np.arange(n * pct_overlap, n * (1 + pct_overlap), dtype=np.int64) - -dr1 = DatetimeIndex('1/1/2000', periods=n, offset=offsets.Minute()) -dr2 = DatetimeIndex( - dr1[int(pct_overlap * n)], periods=n, offset=offsets.Minute(2)) - -aobj = a.astype(object) -bobj = b.astype(object) - -av = np.random.randn(n) -bv = np.random.randn(n) - -avf = np.random.randn(n, K) -bvf = np.random.randn(n, K) - -a_series = Series(av, index=a) -b_series = Series(bv, index=b) - -a_frame = DataFrame(avf, index=a, columns=lrange(K)) -b_frame = DataFrame(bvf, index=b, columns=lrange(K, 2 * K)) - - -def do_left_join(a, b, av, bv): - out = np.empty((len(a), 2)) - lib.left_join_1d(a, b, av, bv, out) - return out - - -def do_outer_join(a, b, av, bv): - result_index, aindexer, bindexer = lib.outer_join_indexer(a, b) - result = np.empty((2, len(result_index))) - lib.take_1d(av, aindexer, result[0]) - lib.take_1d(bv, bindexer, result[1]) - return result_index, result - - -def do_inner_join(a, b, av, bv): - result_index, aindexer, bindexer = lib.inner_join_indexer(a, b) - result = np.empty((2, len(result_index))) - lib.take_1d(av, aindexer, result[0]) - lib.take_1d(bv, bindexer, result[1]) - return result_index, result - -from line_profiler import LineProfiler -prof = LineProfiler() - -from pandas.util.testing import set_trace - - -def do_left_join_python(a, b, av, bv): - indexer, mask = lib.ordered_left_join_int64(a, b) - - n, ak = av.shape - _, bk = bv.shape - result_width = ak + bk - - result = np.empty((result_width, n), dtype=np.float64) - result[:ak] = av.T - - bchunk = result[ak:] - _take_multi(bv.T, indexer, bchunk) - np.putmask(bchunk, np.tile(mask, bk), np.nan) - return result - - -def _take_multi(data, indexer, out): - if not data.flags.c_contiguous: - data = data.copy() - for i in range(data.shape[0]): - 
data[i].take(indexer, out=out[i]) - - -def do_left_join_multi(a, b, av, bv): - n, ak = av.shape - _, bk = bv.shape - result = np.empty((n, ak + bk), dtype=np.float64) - lib.left_join_2d(a, b, av, bv, result) - return result - - -def do_outer_join_multi(a, b, av, bv): - n, ak = av.shape - _, bk = bv.shape - result_index, rindexer, lindexer = lib.outer_join_indexer(a, b) - result = np.empty((len(result_index), ak + bk), dtype=np.float64) - lib.take_join_contiguous(av, bv, lindexer, rindexer, result) - # result = np.empty((ak + bk, len(result_index)), dtype=np.float64) - # lib.take_axis0(av, rindexer, out=result[:ak].T) - # lib.take_axis0(bv, lindexer, out=result[ak:].T) - return result_index, result - - -def do_inner_join_multi(a, b, av, bv): - n, ak = av.shape - _, bk = bv.shape - result_index, rindexer, lindexer = lib.inner_join_indexer(a, b) - result = np.empty((len(result_index), ak + bk), dtype=np.float64) - lib.take_join_contiguous(av, bv, lindexer, rindexer, result) - # result = np.empty((ak + bk, len(result_index)), dtype=np.float64) - # lib.take_axis0(av, rindexer, out=result[:ak].T) - # lib.take_axis0(bv, lindexer, out=result[ak:].T) - return result_index, result - - -def do_left_join_multi_v2(a, b, av, bv): - indexer, mask = lib.ordered_left_join_int64(a, b) - bv_taken = bv.take(indexer, axis=0) - np.putmask(bv_taken, mask.repeat(bv.shape[1]), np.nan) - return np.concatenate((av, bv_taken), axis=1) - - -def do_left_join_series(a, b): - return b.reindex(a.index) - - -def do_left_join_frame(a, b): - a.index._indexMap = None - b.index._indexMap = None - return a.join(b, how='left') - - -# a = np.array([1, 2, 3, 4, 5], dtype=np.int64) -# b = np.array([0, 3, 5, 7, 9], dtype=np.int64) -# print(lib.inner_join_indexer(a, b)) - -out = np.empty((10, 120000)) - - -def join(a, b, av, bv, how="left"): - func_dict = {'left': do_left_join_multi, - 'outer': do_outer_join_multi, - 'inner': do_inner_join_multi} - - f = func_dict[how] - return f(a, b, av, bv) - - -def bench_python(n=100000, pct_overlap=0.20, K=1): - import gc - ns = [2, 3, 4, 5, 6] - iterations = 200 - pct_overlap = 0.2 - kinds = ['outer', 'left', 'inner'] - - all_results = {} - for logn in ns: - n = 10 ** logn - a = np.arange(n, dtype=np.int64) - b = np.arange(n * pct_overlap, n * pct_overlap + n, dtype=np.int64) - - avf = np.random.randn(n, K) - bvf = np.random.randn(n, K) - - a_frame = DataFrame(avf, index=a, columns=lrange(K)) - b_frame = DataFrame(bvf, index=b, columns=lrange(K, 2 * K)) - - all_results[logn] = result = {} - - for kind in kinds: - gc.disable() - elapsed = 0 - _s = time.clock() - for i in range(iterations): - if i % 10 == 0: - elapsed += time.clock() - _s - gc.collect() - _s = time.clock() - a_frame.join(b_frame, how=kind) - # join(a, b, avf, bvf, how=kind) - elapsed += time.clock() - _s - gc.enable() - result[kind] = (elapsed / iterations) * 1000 - - return DataFrame(all_results, index=kinds) - - -def bench_xts(n=100000, pct_overlap=0.20): - from pandas.rpy.common import r - r('a <- 5') - - xrng = '1:%d' % n - - start = n * pct_overlap + 1 - end = n + start - 1 - yrng = '%d:%d' % (start, end) - - r('library(xts)') - - iterations = 500 - - kinds = ['left', 'outer', 'inner'] - result = {} - for kind in kinds: - r('x <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, xrng)) - r('y <- xts(rnorm(%d), as.POSIXct(Sys.Date()) + %s)' % (n, yrng)) - stmt = 'for (i in 1:%d) merge(x, y, join="%s")' % (iterations, kind) - elapsed = r('as.list(system.time(%s, gcFirst=F))$elapsed' % stmt)[0] - result[kind] = (elapsed / 
iterations) * 1000 - return Series(result) diff --git a/scripts/bench_join_multi.py b/scripts/bench_join_multi.py deleted file mode 100644 index b19da6a2c47d8..0000000000000 --- a/scripts/bench_join_multi.py +++ /dev/null @@ -1,32 +0,0 @@ -from pandas import * - -import numpy as np -from pandas.compat import zip, range, lzip -from pandas.util.testing import rands -import pandas._libs.lib as lib - -N = 100000 - -key1 = [rands(10) for _ in range(N)] -key2 = [rands(10) for _ in range(N)] - -zipped = lzip(key1, key2) - - -def _zip(*args): - arr = np.empty(N, dtype=object) - arr[:] = lzip(*args) - return arr - - -def _zip2(*args): - return lib.list_to_object_array(lzip(*args)) - -index = MultiIndex.from_arrays([key1, key2]) -to_join = DataFrame({'j1': np.random.randn(100000)}, index=index) - -data = DataFrame({'A': np.random.randn(500000), - 'key1': np.repeat(key1, 5), - 'key2': np.repeat(key2, 5)}) - -# data.join(to_join, on=['key1', 'key2']) diff --git a/scripts/bench_refactor.py b/scripts/bench_refactor.py deleted file mode 100644 index dafba371e995a..0000000000000 --- a/scripts/bench_refactor.py +++ /dev/null @@ -1,51 +0,0 @@ -from pandas import * -from pandas.compat import range -try: - import pandas.core.internals as internals - reload(internals) - import pandas.core.frame as frame - reload(frame) - from pandas.core.frame import DataFrame as DataMatrix -except ImportError: - pass - -N = 1000 -K = 500 - - -def horribly_unconsolidated(): - index = np.arange(N) - - df = DataMatrix(index=index) - - for i in range(K): - df[i] = float(K) - - return df - - -def bench_reindex_index(df, it=100): - new_idx = np.arange(0, N, 2) - for i in range(it): - df.reindex(new_idx) - - -def bench_reindex_columns(df, it=100): - new_cols = np.arange(0, K, 2) - for i in range(it): - df.reindex(columns=new_cols) - - -def bench_join_index(df, it=10): - left = df.reindex(index=np.arange(0, N, 2), - columns=np.arange(K // 2)) - right = df.reindex(columns=np.arange(K // 2 + 1, K)) - for i in range(it): - joined = left.join(right) - -if __name__ == '__main__': - df = horribly_unconsolidated() - left = df.reindex(index=np.arange(0, N, 2), - columns=np.arange(K // 2)) - right = df.reindex(columns=np.arange(K // 2 + 1, K)) - bench_join_index(df) diff --git a/scripts/boxplot_test.py b/scripts/boxplot_test.py deleted file mode 100644 index 3704f7b60dc60..0000000000000 --- a/scripts/boxplot_test.py +++ /dev/null @@ -1,14 +0,0 @@ -import matplotlib.pyplot as plt - -import random -import pandas.util.testing as tm -tm.N = 1000 -df = tm.makeTimeDataFrame() -import string -foo = list(string.letters[:5]) * 200 -df['indic'] = list(string.letters[:5]) * 200 -random.shuffle(foo) -df['indic2'] = foo -df.boxplot(by=['indic', 'indic2'], fontsize=8, rot=90) - -plt.show() diff --git a/scripts/count_code.sh b/scripts/count_code.sh deleted file mode 100755 index 991faf2e8711b..0000000000000 --- a/scripts/count_code.sh +++ /dev/null @@ -1 +0,0 @@ -cloc pandas --force-lang=Python,pyx --not-match-f="parser.c|lib.c|tslib.c|sandbox.c|hashtable.c|sparse.c|algos.c|index.c" \ No newline at end of file diff --git a/scripts/faster_xs.py b/scripts/faster_xs.py deleted file mode 100644 index 2bb6271124c4f..0000000000000 --- a/scripts/faster_xs.py +++ /dev/null @@ -1,15 +0,0 @@ -import numpy as np - -import pandas.util.testing as tm - -from pandas.core.internals import _interleaved_dtype - -df = tm.makeDataFrame() - -df['E'] = 'foo' -df['F'] = 'foo' -df['G'] = 2 -df['H'] = df['A'] > 0 - -blocks = df._data.blocks -items = df.columns diff --git 
a/scripts/file_sizes.py b/scripts/file_sizes.py deleted file mode 100644 index de03c72ffbd09..0000000000000 --- a/scripts/file_sizes.py +++ /dev/null @@ -1,208 +0,0 @@ -from __future__ import print_function -import os -import sys - -import numpy as np -import matplotlib.pyplot as plt - -from pandas import DataFrame -from pandas.util.testing import set_trace -from pandas import compat - -dirs = [] -names = [] -lengths = [] - -if len(sys.argv) > 1: - loc = sys.argv[1] -else: - loc = '.' -walked = os.walk(loc) - - -def _should_count_file(path): - return path.endswith('.py') or path.endswith('.pyx') - - -def _is_def_line(line): - """def/cdef/cpdef, but not `cdef class`""" - return (line.endswith(':') and not 'class' in line.split() and - (line.startswith('def ') or - line.startswith('cdef ') or - line.startswith('cpdef ') or - ' def ' in line or ' cdef ' in line or ' cpdef ' in line)) - - -class LengthCounter(object): - """ - should add option for subtracting nested function lengths?? - """ - def __init__(self, lines): - self.lines = lines - self.pos = 0 - self.counts = [] - self.n = len(lines) - - def get_counts(self): - self.pos = 0 - self.counts = [] - while self.pos < self.n: - line = self.lines[self.pos] - self.pos += 1 - if _is_def_line(line): - level = _get_indent_level(line) - self._count_function(indent_level=level) - return self.counts - - def _count_function(self, indent_level=1): - indent = ' ' * indent_level - - def _end_of_function(line): - return (line != '' and - not line.startswith(indent) and - not line.startswith('#')) - - start_pos = self.pos - while self.pos < self.n: - line = self.lines[self.pos] - if _end_of_function(line): - self._push_count(start_pos) - return - - self.pos += 1 - - if _is_def_line(line): - self._count_function(indent_level=indent_level + 1) - - # end of file - self._push_count(start_pos) - - def _push_count(self, start_pos): - func_lines = self.lines[start_pos:self.pos] - - if len(func_lines) > 300: - set_trace() - - # remove blank lines at end - while len(func_lines) > 0 and func_lines[-1] == '': - func_lines = func_lines[:-1] - - # remove docstrings and comments - clean_lines = [] - in_docstring = False - for line in func_lines: - line = line.strip() - if in_docstring and _is_triplequote(line): - in_docstring = False - continue - - if line.startswith('#'): - continue - - if _is_triplequote(line): - in_docstring = True - continue - - self.counts.append(len(func_lines)) - - -def _get_indent_level(line): - level = 0 - while line.startswith(' ' * level): - level += 1 - return level - - -def _is_triplequote(line): - return line.startswith('"""') or line.startswith("'''") - - -def _get_file_function_lengths(path): - lines = [x.rstrip() for x in open(path).readlines()] - counter = LengthCounter(lines) - return counter.get_counts() - -# def test_get_function_lengths(): -text = """ -class Foo: - -def foo(): - def bar(): - a = 1 - - b = 2 - - c = 3 - - foo = 'bar' - -def x(): - a = 1 - - b = 3 - - c = 7 - - pass -""" - -expected = [5, 8, 7] - -lines = [x.rstrip() for x in text.splitlines()] -counter = LengthCounter(lines) -result = counter.get_counts() -assert(result == expected) - - -def doit(): - for directory, _, files in walked: - print(directory) - for path in files: - if not _should_count_file(path): - continue - - full_path = os.path.join(directory, path) - print(full_path) - lines = len(open(full_path).readlines()) - - dirs.append(directory) - names.append(path) - lengths.append(lines) - - result = DataFrame({'dirs': dirs, 'names': names, - 'lengths': 
lengths}) - - -def doit2(): - counts = {} - for directory, _, files in walked: - print(directory) - for path in files: - if not _should_count_file(path) or path.startswith('test_'): - continue - - full_path = os.path.join(directory, path) - counts[full_path] = _get_file_function_lengths(full_path) - - return counts - -counts = doit2() - -# counts = _get_file_function_lengths('pandas/tests/test_series.py') - -all_counts = [] -for k, v in compat.iteritems(counts): - all_counts.extend(v) -all_counts = np.array(all_counts) - -fig = plt.figure(figsize=(10, 5)) -ax = fig.add_subplot(111) -ax.hist(all_counts, bins=100) -n = len(all_counts) -nmore = (all_counts > 50).sum() -ax.set_title('%s function lengths, n=%d' % ('pandas', n)) -ax.set_ylabel('N functions') -ax.set_xlabel('Function length') -ax.text(100, 300, '%.3f%% with > 50 lines' % ((n - nmore) / float(n)), - fontsize=18) -plt.show() diff --git a/scripts/gen_release_notes.py b/scripts/gen_release_notes.py deleted file mode 100644 index 7e4ffca59a0ab..0000000000000 --- a/scripts/gen_release_notes.py +++ /dev/null @@ -1,95 +0,0 @@ -from __future__ import print_function -import sys -import json -from pandas.io.common import urlopen -from datetime import datetime - - -class Milestone(object): - - def __init__(self, title, number): - self.title = title - self.number = number - - def __eq__(self, other): - if isinstance(other, Milestone): - return self.number == other.number - return False - - -class Issue(object): - - def __init__(self, title, labels, number, milestone, body, state): - self.title = title - self.labels = set([x['name'] for x in labels]) - self.number = number - self.milestone = milestone - self.body = body - self.closed = state == 'closed' - - def __eq__(self, other): - if isinstance(other, Issue): - return self.number == other.number - return False - - -def get_issues(): - all_issues = [] - page_number = 1 - while True: - iss = _get_page(page_number) - if len(iss) == 0: - break - page_number += 1 - all_issues.extend(iss) - return all_issues - - -def _get_page(page_number): - gh_url = ('https://api.github.com/repos/pandas-dev/pandas/issues?' - 'milestone=*&state=closed&assignee=*&page=%d') % page_number - with urlopen(gh_url) as resp: - rs = resp.readlines()[0] - jsondata = json.loads(rs) - issues = [Issue(x['title'], x['labels'], x['number'], - get_milestone(x['milestone']), x['body'], x['state']) - for x in jsondata] - return issues - - -def get_milestone(data): - if data is None: - return None - return Milestone(data['title'], data['number']) - - -def collate_label(issues, label): - lines = [] - for x in issues: - if label in x.labels: - lines.append('\t- %s(#%d)' % (x.title, x.number)) - - return '\n'.join(lines) - - -def release_notes(milestone): - issues = get_issues() - - headers = ['New Features', 'Improvements to existing features', - 'API Changes', 'Bug fixes'] - labels = ['New', 'Enhancement', 'API-Change', 'Bug'] - - rs = 'pandas %s' % milestone - rs += '\n' + ('=' * len(rs)) - rs += '\n\n **Release date:** %s' % datetime.today().strftime('%B %d, %Y') - for i, h in enumerate(headers): - rs += '\n\n**%s**\n\n' % h - l = labels[i] - rs += collate_label(issues, l) - - return rs - -if __name__ == '__main__': - - rs = release_notes(sys.argv[1]) - print(rs) diff --git a/scripts/git-mrb b/scripts/git-mrb deleted file mode 100644 index c15e6dbf9f51a..0000000000000 --- a/scripts/git-mrb +++ /dev/null @@ -1,82 +0,0 @@ -#!/usr/bin/env python -"""git-mrb: merge remote branch. 
- -git mrb [remote:branch OR remote-branch] [onto] [upstream] - -remote must be locally available, and branch must exist in that remote. - -If 'onto' branch isn't given, default is 'master'. - -If 'upstream' repository isn't given, default is 'origin'. - -You can separate the remote and branch spec with either a : or a -. - -Taken from IPython project -""" -#----------------------------------------------------------------------------- -# Imports -#----------------------------------------------------------------------------- - -from subprocess import check_call -import sys - -#----------------------------------------------------------------------------- -# Functions -#----------------------------------------------------------------------------- - -def sh(cmd): - cmd = cmd.format(**shvars) - print('$', cmd) - check_call(cmd, shell=True) - -#----------------------------------------------------------------------------- -# Main Script -#----------------------------------------------------------------------------- - -argv = sys.argv[1:] -narg = len(argv) - -try: - branch_spec = argv[0] - sep = ':' if ':' in branch_spec else '-' - remote, branch = branch_spec.split(':', 1) - if not branch: - raise ValueError('Branch spec %s invalid, branch not found' % - branch_spec) -except: - import traceback as tb - tb.print_exc() - print(__doc__) - sys.exit(1) - -onto = argv[1] if narg >= 2 else 'master' -upstream = argv[1] if narg == 3 else 'origin' - -# Git doesn't like ':' in branch names. -if sep == ':': - branch_spec = branch_spec.replace(':', '-') - -# Global used by sh -shvars = dict(remote=remote, branch_spec=branch_spec, branch=branch, - onto=onto, upstream=upstream) - -# Start git calls. -sh('git fetch {remote}') -sh('git checkout -b {branch_spec} {onto}') -sh('git merge {remote}/{branch}') - -print(""" -************************************************************* - Run test suite. If tests pass, run the following to merge: - -git checkout {onto} -git merge {branch_spec} -git push {upstream} {onto} - -************************************************************* -""".format(**shvars)) - -ans = raw_input("Revert to master and delete temporary branch? 
[Y/n]: ") -if ans.strip().lower() in ('', 'y', 'yes'): - sh('git checkout {onto}') - sh('git branch -D {branch_spec}') \ No newline at end of file diff --git a/scripts/git_code_churn.py b/scripts/git_code_churn.py deleted file mode 100644 index 18c9b244a6ba0..0000000000000 --- a/scripts/git_code_churn.py +++ /dev/null @@ -1,34 +0,0 @@ -import subprocess -import os -import re -import sys - -import numpy as np - -from pandas import * - - -if __name__ == '__main__': - from vbench.git import GitRepo - repo = GitRepo('/Users/wesm/code/pandas') - churn = repo.get_churn_by_file() - - file_include = [] - for path in churn.major_axis: - if path.endswith('.pyx') or path.endswith('.py'): - file_include.append(path) - commits_include = [sha for sha in churn.minor_axis - if 'LF' not in repo.messages[sha]] - commits_include.remove('dcf3490') - - clean_churn = churn.reindex(major=file_include, minor=commits_include) - - by_commit = clean_churn.sum('major').sum(1) - - by_date = by_commit.groupby(repo.commit_date).sum() - - by_date = by_date.drop([datetime(2011, 6, 10)]) - - # clean out days where I touched Cython - - by_date = by_date[by_date < 5000] diff --git a/scripts/groupby_sample.py b/scripts/groupby_sample.py deleted file mode 100644 index 42008858d3cad..0000000000000 --- a/scripts/groupby_sample.py +++ /dev/null @@ -1,54 +0,0 @@ -from pandas import * -import numpy as np -import string -import pandas.compat as compat - -g1 = np.array(list(string.letters))[:-1] -g2 = np.arange(510) -df_small = DataFrame({'group1': ["a", "b", "a", "a", "b", "c", "c", "c", "c", - "c", "a", "a", "a", "b", "b", "b", "b"], - 'group2': [1, 2, 3, 4, 1, 3, 5, 6, 5, 4, 1, 2, 3, 4, 3, 2, 1], - 'value': ["apple", "pear", "orange", "apple", - "banana", "durian", "lemon", "lime", - "raspberry", "durian", "peach", "nectarine", - "banana", "lemon", "guava", "blackberry", - "grape"]}) -value = df_small['value'].values.repeat(3) -df = DataFrame({'group1': g1.repeat(4000 * 5), - 'group2': np.tile(g2, 400 * 5), - 'value': value.repeat(4000 * 5)}) - - -def random_sample(): - grouped = df.groupby(['group1', 'group2'])['value'] - from random import choice - choose = lambda group: choice(group.index) - indices = grouped.apply(choose) - return df.reindex(indices) - - -def random_sample_v2(): - grouped = df.groupby(['group1', 'group2'])['value'] - from random import choice - choose = lambda group: choice(group.index) - indices = [choice(v) for k, v in compat.iteritems(grouped.groups)] - return df.reindex(indices) - - -def do_shuffle(arr): - from random import shuffle - result = arr.copy().values - shuffle(result) - return result - - -def shuffle_uri(df, grouped): - perm = np.r_[tuple([np.random.permutation( - idxs) for idxs in compat.itervalues(grouped.groups)])] - df['state_permuted'] = np.asarray(df.ix[perm]['value']) - -df2 = df.copy() -grouped = df2.groupby('group1') -shuffle_uri(df2, grouped) - -df2['state_perm'] = grouped['value'].transform(do_shuffle) diff --git a/scripts/groupby_speed.py b/scripts/groupby_speed.py deleted file mode 100644 index 3be9fac12418e..0000000000000 --- a/scripts/groupby_speed.py +++ /dev/null @@ -1,35 +0,0 @@ -from __future__ import print_function -from pandas import * - -rng = DatetimeIndex('1/3/2011', '11/30/2011', offset=offsets.Minute()) - -df = DataFrame(np.random.randn(len(rng), 5), index=rng, - columns=list('OHLCV')) - -rng5 = DatetimeIndex('1/3/2011', '11/30/2011', offset=offsets.Minute(5)) -gp = rng5.asof -grouped = df.groupby(gp) - - -def get1(dt): - k = gp(dt) - return grouped.get_group(k) - - 
-def get2(dt): - k = gp(dt) - return df.ix[grouped.groups[k]] - - -def f(): - for i, date in enumerate(df.index): - if i % 10000 == 0: - print(i) - get1(date) - - -def g(): - for i, date in enumerate(df.index): - if i % 10000 == 0: - print(i) - get2(date) diff --git a/scripts/groupby_test.py b/scripts/groupby_test.py deleted file mode 100644 index f640a6ed79503..0000000000000 --- a/scripts/groupby_test.py +++ /dev/null @@ -1,145 +0,0 @@ -from collections import defaultdict - -from numpy import nan -import numpy as np - -from pandas import * - -import pandas._libs.lib as tseries -import pandas.core.groupby as gp -import pandas.util.testing as tm -from pandas.compat import range -reload(gp) - -""" - -k = 1000 -values = np.random.randn(8 * k) -key1 = np.array(['foo', 'bar', 'baz', 'bar', 'foo', 'baz', 'bar', 'baz'] * k, - dtype=object) -key2 = np.array(['b', 'b', 'b', 'b', 'a', 'a', 'a', 'a' ] * k, - dtype=object) -shape, labels, idicts = gp.labelize(key1, key2) - -print(tseries.group_labels(key1)) - -# print(shape) -# print(labels) -# print(idicts) - -result = tseries.group_aggregate(values, labels, shape) - -print(tseries.groupby_indices(key2)) - -df = DataFrame({'key1' : key1, - 'key2' : key2, - 'v1' : values, - 'v2' : values}) -k1 = df['key1'] -k2 = df['key2'] - -# del df['key1'] -# del df['key2'] - -# r2 = gp.multi_groupby(df, np.sum, k1, k2) - -# print(result) - -gen = gp.generate_groups(df['v1'], labels, shape, axis=1, - factory=DataFrame) - -res = defaultdict(dict) -for a, gen1 in gen: - for b, group in gen1: - print(a, b) - print(group) - # res[b][a] = group['values'].sum() - res[b][a] = group.sum() - -res = DataFrame(res) - -grouped = df.groupby(['key1', 'key2']) -""" - -# data = {'A' : [0, 0, 0, 0, 1, 1, 1, 1, 1, 1., nan, nan], -# 'B' : ['A', 'B'] * 6, -# 'C' : np.random.randn(12)} -# df = DataFrame(data) -# df['C'][2:10:2] = nan - -# single column -# grouped = df.drop(['B'], axis=1).groupby('A') -# exp = {} -# for cat, group in grouped: -# exp[cat] = group['C'].sum() -# exp = DataFrame({'C' : exp}) -# result = grouped.sum() - -# grouped = df.groupby(['A', 'B']) -# expd = {} -# for cat1, cat2, group in grouped: -# expd.setdefault(cat1, {})[cat2] = group['C'].sum() -# exp = DataFrame(expd).T.stack() -# result = grouped.sum()['C'] - -# print('wanted') -# print(exp) -# print('got') -# print(result) - -# tm.N = 10000 - -# mapping = {'A': 0, 'C': 1, 'B': 0, 'D': 1} -# tf = lambda x: x - x.mean() - -# df = tm.makeTimeDataFrame() -# ts = df['A'] - -# # grouped = df.groupby(lambda x: x.strftime('%m/%y')) -# grouped = df.groupby(mapping, axis=1) -# groupedT = df.T.groupby(mapping, axis=0) - -# r1 = groupedT.transform(tf).T -# r2 = grouped.transform(tf) - -# fillit = lambda x: x.fillna(method='pad') - -# f = lambda x: x - -# transformed = df.groupby(lambda x: x.strftime('%m/%y')).transform(lambda -# x: x) - -# def ohlc(group): -# return Series([group[0], group.max(), group.min(), group[-1]], -# index=['open', 'high', 'low', 'close']) -# grouper = [lambda x: x.year, lambda x: x.month] -# dr = DateRange('1/1/2000', '1/1/2002') -# ts = Series(np.random.randn(len(dr)), index=dr) - -# import string - -# k = 20 -# n = 1000 - -# keys = list(string.letters[:k]) - -# df = DataFrame({'A' : np.tile(keys, n), -# 'B' : np.repeat(keys[:k/2], n * 2), -# 'C' : np.random.randn(k * n)}) - -# def f(): -# for x in df.groupby(['A', 'B']): -# pass - -a = np.arange(100).repeat(100) -b = np.tile(np.arange(100), 100) -index = MultiIndex.from_arrays([a, b]) -s = Series(np.random.randn(len(index)), index) -df = 
DataFrame({'A': s}) -df['B'] = df.index.get_level_values(0) -df['C'] = df.index.get_level_values(1) - - -def f(): - for x in df.groupby(['B', 'B']): - pass diff --git a/scripts/hdfstore_panel_perf.py b/scripts/hdfstore_panel_perf.py deleted file mode 100644 index c66e9506fc4c5..0000000000000 --- a/scripts/hdfstore_panel_perf.py +++ /dev/null @@ -1,17 +0,0 @@ -from pandas import * -from pandas.util.testing import rands -from pandas.compat import range - -i, j, k = 7, 771, 5532 - -panel = Panel(np.random.randn(i, j, k), - items=[rands(10) for _ in range(i)], - major_axis=DatetimeIndex('1/1/2000', periods=j, - offset=offsets.Minute()), - minor_axis=[rands(10) for _ in range(k)]) - - -store = HDFStore('test.h5') -store.put('test_panel', panel, table=True) - -retrieved = store['test_panel'] diff --git a/scripts/json_manip.py b/scripts/json_manip.py deleted file mode 100644 index 7ff4547825568..0000000000000 --- a/scripts/json_manip.py +++ /dev/null @@ -1,423 +0,0 @@ -""" - -Tasks -------- - -Search and transform jsonable structures, specifically to make it 'easy' to make tabular/csv output for other consumers. - -Example -~~~~~~~~~~~~~ - - *give me a list of all the fields called 'id' in this stupid, gnarly - thing* - - >>> Q('id',gnarly_data) - ['id1','id2','id3'] - - -Observations: ---------------------- - -1) 'simple data structures' exist and are common. They are tedious - to search. - -2) The DOM is another nested / treeish structure, and jQuery selector is - a good tool for that. - -3a) R, Numpy, Excel and other analysis tools want 'tabular' data. These - analyses are valuable and worth doing. - -3b) Dot/Graphviz, NetworkX, and some other analyses *like* treeish/dicty - things, and those analyses are also worth doing! - -3c) Some analyses are best done using 'one-off' and custom code in C, Python, - or another 'real' programming language. - -4) Arbitrary transforms are tedious and error prone. SQL is one solution, - XSLT is another, - -5) the XPATH/XML/XSLT family is.... not universally loved :) They are - very complete, and the completeness can make simple cases... gross. - -6) For really complicated data structures, we can write one-off code. Getting - 80% of the way is mostly okay. There will always have to be programmers - in the loop. - -7) Re-inventing SQL is probably a failure mode. So is reinventing XPATH, XSLT - and the like. Be wary of mission creep! Re-use when possible (e.g., can - we put the thing into a DOM using - -8) If the interface is good, people can improve performance later. - - -Simplifying ---------------- - - -1) Assuming 'jsonable' structures - -2) keys are strings or stringlike. Python allows any hashable to be a key. - for now, we pretend that doesn't happen. - -3) assumes most dicts are 'well behaved'. DAG, no cycles! - -4) assume that if people want really specialized transforms, they can do it - themselves. 
- -""" -from __future__ import print_function - -from collections import namedtuple -import csv -import itertools -from itertools import product -from operator import attrgetter as aget, itemgetter as iget -import operator -import sys -from pandas.compat import map, u, callable, Counter -import pandas.compat as compat - - -## note 'url' appears multiple places and not all extensions have same struct -ex1 = { - 'name': 'Gregg', - 'extensions': [ - {'id':'hello', - 'url':'url1'}, - {'id':'gbye', - 'url':'url2', - 'more': dict(url='url3')}, - ] -} - -## much longer example -ex2 = {u('metadata'): {u('accessibilities'): [{u('name'): u('accessibility.tabfocus'), - u('value'): 7}, - {u('name'): u('accessibility.mouse_focuses_formcontrol'), u('value'): False}, - {u('name'): u('accessibility.browsewithcaret'), u('value'): False}, - {u('name'): u('accessibility.win32.force_disabled'), u('value'): False}, - {u('name'): u('accessibility.typeaheadfind.startlinksonly'), u('value'): False}, - {u('name'): u('accessibility.usebrailledisplay'), u('value'): u('')}, - {u('name'): u('accessibility.typeaheadfind.timeout'), u('value'): 5000}, - {u('name'): u('accessibility.typeaheadfind.enabletimeout'), u('value'): True}, - {u('name'): u('accessibility.tabfocus_applies_to_xul'), u('value'): False}, - {u('name'): u('accessibility.typeaheadfind.flashBar'), u('value'): 1}, - {u('name'): u('accessibility.typeaheadfind.autostart'), u('value'): True}, - {u('name'): u('accessibility.blockautorefresh'), u('value'): False}, - {u('name'): u('accessibility.browsewithcaret_shortcut.enabled'), - u('value'): True}, - {u('name'): u('accessibility.typeaheadfind.enablesound'), u('value'): True}, - {u('name'): u('accessibility.typeaheadfind.prefillwithselection'), - u('value'): True}, - {u('name'): u('accessibility.typeaheadfind.soundURL'), u('value'): u('beep')}, - {u('name'): u('accessibility.typeaheadfind'), u('value'): False}, - {u('name'): u('accessibility.typeaheadfind.casesensitive'), u('value'): 0}, - {u('name'): u('accessibility.warn_on_browsewithcaret'), u('value'): True}, - {u('name'): u('accessibility.usetexttospeech'), u('value'): u('')}, - {u('name'): u('accessibility.accesskeycausesactivation'), u('value'): True}, - {u('name'): u('accessibility.typeaheadfind.linksonly'), u('value'): False}, - {u('name'): u('isInstantiated'), u('value'): True}], - u('extensions'): [{u('id'): u('216ee7f7f4a5b8175374cd62150664efe2433a31'), - u('isEnabled'): True}, - {u('id'): u('1aa53d3b720800c43c4ced5740a6e82bb0b3813e'), u('isEnabled'): False}, - {u('id'): u('01ecfac5a7bd8c9e27b7c5499e71c2d285084b37'), u('isEnabled'): True}, - {u('id'): u('1c01f5b22371b70b312ace94785f7b0b87c3dfb2'), u('isEnabled'): True}, - {u('id'): u('fb723781a2385055f7d024788b75e959ad8ea8c3'), u('isEnabled'): True}], - u('fxVersion'): u('9.0'), - u('location'): u('zh-CN'), - u('operatingSystem'): u('WINNT Windows NT 5.1'), - u('surveyAnswers'): u(''), - u('task_guid'): u('d69fbd15-2517-45b5-8a17-bb7354122a75'), - u('tpVersion'): u('1.2'), - u('updateChannel'): u('beta')}, - u('survey_data'): { - u('extensions'): [{u('appDisabled'): False, - u('id'): u('testpilot?labs.mozilla.com'), - u('isCompatible'): True, - u('isEnabled'): True, - u('isPlatformCompatible'): True, - u('name'): u('Test Pilot')}, - {u('appDisabled'): True, - u('id'): u('dict?www.youdao.com'), - u('isCompatible'): False, - u('isEnabled'): False, - u('isPlatformCompatible'): True, - u('name'): u('Youdao Word Capturer')}, - {u('appDisabled'): False, - u('id'): u('jqs?sun.com'), - u('isCompatible'): 
True, - u('isEnabled'): True, - u('isPlatformCompatible'): True, - u('name'): u('Java Quick Starter')}, - {u('appDisabled'): False, - u('id'): u('?20a82645-c095-46ed-80e3-08825760534b?'), - u('isCompatible'): True, - u('isEnabled'): True, - u('isPlatformCompatible'): True, - u('name'): u('Microsoft .NET Framework Assistant')}, - {u('appDisabled'): False, - u('id'): u('?a0d7ccb3-214d-498b-b4aa-0e8fda9a7bf7?'), - u('isCompatible'): True, - u('isEnabled'): True, - u('isPlatformCompatible'): True, - u('name'): u('WOT')}], - u('version_number'): 1}} - -# class SurveyResult(object): - -# def __init__(self, record): -# self.record = record -# self.metadata, self.survey_data = self._flatten_results() - -# def _flatten_results(self): -# survey_data = self.record['survey_data'] -# extensions = DataFrame(survey_data['extensions']) - -def denorm(queries,iterable_of_things,default=None): - """ - 'repeat', or 'stutter' to 'tableize' for downstream. - (I have no idea what a good word for this is!) - - Think ``kronecker`` products, or: - - ``SELECT single,multiple FROM table;`` - - single multiple - ------- --------- - id1 val1 - id1 val2 - - - Args: - - queries: iterable of ``Q`` queries. - iterable_of_things: to be queried. - - Returns: - - list of 'stuttered' output, where if a query returns - a 'single', it gets repeated appropriately. - - - """ - - def _denorm(queries,thing): - fields = [] - results = [] - for q in queries: - #print(q) - r = Ql(q,thing) - #print("-- result: ", r) - if not r: - r = [default] - if isinstance(r[0], type({})): - fields.append(sorted(r[0].keys())) # dicty answers - else: - fields.append([q]) # stringy answer - - results.append(r) - - #print(results) - #print(fields) - flist = list(flatten(*map(iter,fields))) - - prod = itertools.product(*results) - for p in prod: - U = dict() - for (ii,thing) in enumerate(p): - #print(ii,thing) - if isinstance(thing, type({})): - U.update(thing) - else: - U[fields[ii][0]] = thing - - yield U - - return list(flatten(*[_denorm(queries,thing) for thing in iterable_of_things])) - - -def default_iget(fields,default=None,): - """ itemgetter with 'default' handling, that *always* returns lists - - API CHANGES from ``operator.itemgetter`` - - Note: Sorry to break the iget api... (fields vs *fields) - Note: *always* returns a list... unlike itemgetter, - which can return tuples or 'singles' - """ - myiget = operator.itemgetter(*fields) - L = len(fields) - def f(thing): - try: - ans = list(myiget(thing)) - if L < 2: - ans = [ans,] - return ans - except KeyError: - # slower! - return [thing.get(x,default) for x in fields] - - f.__doc__ = "itemgetter with default %r for fields %r" %(default,fields) - f.__name__ = "default_itemgetter" - return f - - -def flatten(*stack): - """ - helper function for flattening iterables of generators in a - sensible way. - """ - stack = list(stack) - while stack: - try: x = next(stack[0]) - except StopIteration: - stack.pop(0) - continue - if hasattr(x,'next') and callable(getattr(x,'next')): - stack.insert(0, x) - - #if isinstance(x, (GeneratorType,listerator)): - else: yield x - - -def _Q(filter_, thing): - """ underlying machinery for Q function recursion """ - T = type(thing) - if isinstance({}, T): - for k,v in compat.iteritems(thing): - #print(k,v) - if filter_ == k: - if isinstance(v, type([])): - yield iter(v) - else: - yield v - - if type(v) in (type({}),type([])): - yield Q(filter_,v) - - elif isinstance([], T): - for k in thing: - #print(k) - yield Q(filter_,k) - - else: - # no recursion. 
- pass - -def Q(filter_,thing): - """ - type(filter): - - list: a flattened list of all searches (one list) - - dict: dict with vals each of which is that search - - Notes: - - [1] 'parent thing', with space, will do a descendent - [2] this will come back 'flattened' jQuery style - [3] returns a generator. Use ``Ql`` if you want a list. - - """ - if isinstance(filter_, type([])): - return flatten(*[_Q(x,thing) for x in filter_]) - elif isinstance(filter_, type({})): - d = dict.fromkeys(list(filter_.keys())) - #print(d) - for k in d: - #print(flatten(Q(k,thing))) - d[k] = Q(k,thing) - - return d - - else: - if " " in filter_: # i.e. "antecendent post" - parts = filter_.strip().split() - r = None - for p in parts: - r = Ql(p,thing) - thing = r - - return r - - else: # simple. - return flatten(_Q(filter_,thing)) - -def Ql(filter_,thing): - """ same as Q, but returns a list, not a generator """ - res = Q(filter_,thing) - - if isinstance(filter_, type({})): - for k in res: - res[k] = list(res[k]) - return res - - else: - return list(res) - - - -def countit(fields,iter_of_iter,default=None): - """ - note: robust to fields not being in i_of_i, using ``default`` - """ - C = Counter() # needs hashables - T = namedtuple("Thing",fields) - get = default_iget(*fields,default=default) - return Counter( - (T(*get(thing)) for thing in iter_of_iter) - ) - - -## right now this works for one row... -def printout(queries,things,default=None, f=sys.stdout, **kwargs): - """ will print header and objects - - **kwargs go to csv.DictWriter - - help(csv.DictWriter) for more. - """ - - results = denorm(queries,things,default=None) - fields = set(itertools.chain(*(x.keys() for x in results))) - - W = csv.DictWriter(f=f,fieldnames=fields,**kwargs) - #print("---prod---") - #print(list(prod)) - W.writeheader() - for r in results: - W.writerow(r) - - -def test_run(): - print("\n>>> print(list(Q('url',ex1)))") - print(list(Q('url',ex1))) - assert list(Q('url',ex1)) == ['url1','url2','url3'] - assert Ql('url',ex1) == ['url1','url2','url3'] - - print("\n>>> print(list(Q(['name','id'],ex1)))") - print(list(Q(['name','id'],ex1))) - assert Ql(['name','id'],ex1) == ['Gregg','hello','gbye'] - - - print("\n>>> print(Ql('more url',ex1))") - print(Ql('more url',ex1)) - - - print("\n>>> list(Q('extensions',ex1))") - print(list(Q('extensions',ex1))) - - print("\n>>> print(Ql('extensions',ex1))") - print(Ql('extensions',ex1)) - - print("\n>>> printout(['name','extensions'],[ex1,], extrasaction='ignore')") - printout(['name','extensions'],[ex1,], extrasaction='ignore') - - print("\n\n") - - from pprint import pprint as pp - - print("-- note that the extension fields are also flattened! 
(and N/A) -- ") - pp(denorm(['location','fxVersion','notthere','survey_data extensions'],[ex2,], default="N/A")[:2]) - - -if __name__ == "__main__": - pass diff --git a/scripts/leak.py b/scripts/leak.py deleted file mode 100644 index 47f74bf020597..0000000000000 --- a/scripts/leak.py +++ /dev/null @@ -1,13 +0,0 @@ -from pandas import * -from pandas.compat import range -import numpy as np -import pandas.util.testing as tm -import os -import psutil - -pid = os.getpid() -proc = psutil.Process(pid) - -df = DataFrame(index=np.arange(100)) -for i in range(5000): - df[i] = 5 diff --git a/scripts/parser_magic.py b/scripts/parser_magic.py deleted file mode 100644 index 72fef39d8db65..0000000000000 --- a/scripts/parser_magic.py +++ /dev/null @@ -1,74 +0,0 @@ -from pandas.util.testing import set_trace -import pandas.util.testing as tm -import pandas.compat as compat - -from pandas import * -import ast -import inspect -import sys - - -def merge(a, b): - f, args, _ = parse_stmt(inspect.currentframe().f_back) - return DataFrame({args[0]: a, - args[1]: b}) - - -def parse_stmt(frame): - info = inspect.getframeinfo(frame) - call = info[-2][0] - mod = ast.parse(call) - body = mod.body[0] - if isinstance(body, (ast.Assign, ast.Expr)): - call = body.value - elif isinstance(body, ast.Call): - call = body - return _parse_call(call) - - -def _parse_call(call): - func = _maybe_format_attribute(call.func) - - str_args = [] - for arg in call.args: - if isinstance(arg, ast.Name): - str_args.append(arg.id) - elif isinstance(arg, ast.Call): - formatted = _format_call(arg) - str_args.append(formatted) - - return func, str_args, {} - - -def _format_call(call): - func, args, kwds = _parse_call(call) - content = '' - if args: - content += ', '.join(args) - if kwds: - fmt_kwds = ['%s=%s' % item for item in compat.iteritems(kwds)] - joined_kwds = ', '.join(fmt_kwds) - if args: - content = content + ', ' + joined_kwds - else: - content += joined_kwds - return '%s(%s)' % (func, content) - - -def _maybe_format_attribute(name): - if isinstance(name, ast.Attribute): - return _format_attribute(name) - return name.id - - -def _format_attribute(attr): - obj = attr.value - if isinstance(attr.value, ast.Attribute): - obj = _format_attribute(attr.value) - else: - obj = obj.id - return '.'.join((obj, attr.attr)) - -a = tm.makeTimeSeries() -b = tm.makeTimeSeries() -df = merge(a, b) diff --git a/scripts/preepoch_test.py b/scripts/preepoch_test.py deleted file mode 100644 index 36a3d768e671f..0000000000000 --- a/scripts/preepoch_test.py +++ /dev/null @@ -1,23 +0,0 @@ -import numpy as np -from pandas import * - - -def panda_test(): - - # generate some data - data = np.random.rand(50, 5) - # generate some dates - dates = DatetimeIndex('1/1/1969', periods=50) - # generate column headings - cols = ['A', 'B', 'C', 'D', 'E'] - - df = DataFrame(data, index=dates, columns=cols) - - # save to HDF5Store - store = HDFStore('bugzilla.h5', mode='w') - store['df'] = df # This gives: OverflowError: mktime argument out of range - store.close() - - -if __name__ == '__main__': - panda_test() diff --git a/scripts/pypistats.py b/scripts/pypistats.py deleted file mode 100644 index 41343f6d30c76..0000000000000 --- a/scripts/pypistats.py +++ /dev/null @@ -1,101 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -Calculates the total number of downloads that a particular PyPI package has -received across all versions tracked by PyPI -""" - -from datetime import datetime -import locale -import sys -import xmlrpclib -import pandas as pd - 
-locale.setlocale(locale.LC_ALL, '') - - -class PyPIDownloadAggregator(object): - - def __init__(self, package_name, include_hidden=True): - self.package_name = package_name - self.include_hidden = include_hidden - self.proxy = xmlrpclib.Server('http://pypi.python.org/pypi') - self._downloads = {} - - @property - def releases(self): - """Retrieves the release number for each uploaded release""" - - result = self.proxy.package_releases(self.package_name, - self.include_hidden) - - if len(result) == 0: - # no matching package--search for possibles, and limit to 15 - # results - results = self.proxy.search({ - 'name': self.package_name, - 'description': self.package_name - }, 'or')[:15] - - # make sure we only get unique package names - matches = [] - for match in results: - name = match['name'] - if name not in matches: - matches.append(name) - - # if only one package was found, return it - if len(matches) == 1: - self.package_name = matches[0] - return self.releases - - error = """No such package found: %s - -Possible matches include: -%s -""" % (self.package_name, '\n'.join('\t- %s' % n for n in matches)) - - sys.exit(error) - - return result - - def get_downloads(self): - """Calculate the total number of downloads for the package""" - downloads = {} - for release in self.releases: - urls = self.proxy.release_urls(self.package_name, release) - urls = pd.DataFrame(urls) - urls['version'] = release - downloads[release] = urls - - return pd.concat(downloads, ignore_index=True) - -if __name__ == '__main__': - agg = PyPIDownloadAggregator('pandas') - - data = agg.get_downloads() - - to_omit = ['0.2b1', '0.2beta'] - - isostrings = data['upload_time'].map(lambda x: x.value) - data['upload_time'] = pd.to_datetime(isostrings) - - totals = data.groupby('version').downloads.sum() - rollup = {'0.8.0rc1': '0.8.0', - '0.8.0rc2': '0.8.0', - '0.3.0.beta': '0.3.0', - '0.3.0.beta2': '0.3.0'} - downloads = totals.groupby(lambda x: rollup.get(x, x)).sum() - - first_upload = data.groupby('version').upload_time.min() - - result = pd.DataFrame({'downloads': totals, - 'release_date': first_upload}) - result = result.sort('release_date') - result = result.drop(to_omit + list(rollup.keys())) - result.index.name = 'release' - - by_date = result.reset_index().set_index('release_date').downloads - dummy = pd.Series(index=pd.DatetimeIndex([datetime(2012, 12, 27)])) - by_date = by_date.append(dummy).shift(1).fillna(0) diff --git a/scripts/roll_median_leak.py b/scripts/roll_median_leak.py deleted file mode 100644 index 03f39e2b18372..0000000000000 --- a/scripts/roll_median_leak.py +++ /dev/null @@ -1,26 +0,0 @@ -from __future__ import print_function -from pandas import * - -import numpy as np -import os - -from vbench.api import Benchmark -from pandas.util.testing import rands -from pandas.compat import range -import pandas._libs.lib as lib -import pandas._sandbox as sbx -import time - -import psutil - -pid = os.getpid() -proc = psutil.Process(pid) - -lst = SparseList() -lst.append([5] * 10000) -lst.append(np.repeat(np.nan, 1000000)) - -for _ in range(10000): - print(proc.get_memory_info()) - sdf = SparseDataFrame({'A': lst.to_array()}) - chunk = sdf[sdf['A'] == 5] diff --git a/scripts/runtests.py b/scripts/runtests.py deleted file mode 100644 index e14752b43116b..0000000000000 --- a/scripts/runtests.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import print_function -import os -print(os.getpid()) -import nose -nose.main('pandas.core') diff --git a/scripts/test_py27.bat b/scripts/test_py27.bat deleted file mode 
100644 index 11e3056287e31..0000000000000 --- a/scripts/test_py27.bat +++ /dev/null @@ -1,6 +0,0 @@ -SET PATH=C:\MinGW\bin;C:\Python27;C:\Python27\Scripts;%PATH% - -python setup.py clean -python setup.py build_ext -c mingw32 --inplace - -nosetests pandas \ No newline at end of file diff --git a/scripts/testmed.py b/scripts/testmed.py deleted file mode 100644 index dd3b952d58c60..0000000000000 --- a/scripts/testmed.py +++ /dev/null @@ -1,171 +0,0 @@ -## {{{ Recipe 576930 (r10): Efficient Running Median using an Indexable Skiplist - -from random import random -from math import log, ceil -from pandas.compat import range -from numpy.random import randn -from pandas.lib.skiplist import rolling_median - - -class Node(object): - __slots__ = 'value', 'next', 'width' - - def __init__(self, value, next, width): - self.value, self.next, self.width = value, next, width - - -class End(object): - 'Sentinel object that always compares greater than another object' - def __cmp__(self, other): - return 1 - -NIL = Node(End(), [], []) # Singleton terminator node - - -class IndexableSkiplist: - 'Sorted collection supporting O(lg n) insertion, removal, and lookup by rank.' - - def __init__(self, expected_size=100): - self.size = 0 - self.maxlevels = int(1 + log(expected_size, 2)) - self.head = Node('HEAD', [NIL] * self.maxlevels, [1] * self.maxlevels) - - def __len__(self): - return self.size - - def __getitem__(self, i): - node = self.head - i += 1 - for level in reversed(range(self.maxlevels)): - while node.width[level] <= i: - i -= node.width[level] - node = node.next[level] - return node.value - - def insert(self, value): - # find first node on each level where node.next[levels].value > value - chain = [None] * self.maxlevels - steps_at_level = [0] * self.maxlevels - node = self.head - for level in reversed(range(self.maxlevels)): - while node.next[level].value <= value: - steps_at_level[level] += node.width[level] - node = node.next[level] - chain[level] = node - - # insert a link to the newnode at each level - d = min(self.maxlevels, 1 - int(log(random(), 2.0))) - newnode = Node(value, [None] * d, [None] * d) - steps = 0 - for level in range(d): - prevnode = chain[level] - newnode.next[level] = prevnode.next[level] - prevnode.next[level] = newnode - newnode.width[level] = prevnode.width[level] - steps - prevnode.width[level] = steps + 1 - steps += steps_at_level[level] - for level in range(d, self.maxlevels): - chain[level].width[level] += 1 - self.size += 1 - - def remove(self, value): - # find first node on each level where node.next[levels].value >= value - chain = [None] * self.maxlevels - node = self.head - for level in reversed(range(self.maxlevels)): - while node.next[level].value < value: - node = node.next[level] - chain[level] = node - if value != chain[0].next[0].value: - raise KeyError('Not Found') - - # remove one link at each level - d = len(chain[0].next[0].next) - for level in range(d): - prevnode = chain[level] - prevnode.width[level] += prevnode.next[level].width[level] - 1 - prevnode.next[level] = prevnode.next[level].next[level] - for level in range(d, self.maxlevels): - chain[level].width[level] -= 1 - self.size -= 1 - - def __iter__(self): - 'Iterate over values in sorted order' - node = self.head.next[0] - while node is not NIL: - yield node.value - node = node.next[0] - -from collections import deque -from itertools import islice - - -class RunningMedian: - 'Fast running median with O(lg n) updates where n is the window size' - - def __init__(self, n, iterable): - from 
pandas.lib.skiplist import IndexableSkiplist as skiplist - - self.it = iter(iterable) - self.queue = deque(islice(self.it, n)) - self.skiplist = IndexableSkiplist(n) - for elem in self.queue: - self.skiplist.insert(elem) - - def __iter__(self): - queue = self.queue - skiplist = self.skiplist - midpoint = len(queue) // 2 - yield skiplist[midpoint] - for newelem in self.it: - oldelem = queue.popleft() - skiplist.remove(oldelem) - queue.append(newelem) - skiplist.insert(newelem) - yield skiplist[midpoint] - -N = 100000 -K = 10000 - -import time - - -def test(): - from numpy.random import randn - - arr = randn(N) - - def _test(arr, k): - meds = RunningMedian(k, arr) - return list(meds) - - _test(arr, K) - - - -def test2(): - - arr = randn(N) - - return rolling_median(arr, K) - - -def runmany(f, arr, arglist): - timings = [] - - for arg in arglist: - tot = 0 - for i in range(5): - tot += _time(f, arr, arg) - timings.append(tot / 5) - - return timings - - -def _time(f, *args): - _start = time.clock() - result = f(*args) - return time.clock() - _start - -if __name__ == '__main__': - test2() diff --git a/scripts/touchup_gh_issues.py b/scripts/touchup_gh_issues.py deleted file mode 100755 index 8aa6d426156f0..0000000000000 --- a/scripts/touchup_gh_issues.py +++ /dev/null @@ -1,44 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from __future__ import print_function -from collections import OrderedDict -import sys -import re - -""" -Reads in stdin, replace all occurences of '#num' or 'GH #num' with -links to github issue. dumps the issue anchors before the next -section header -""" - -pat = "((?:\s*GH\s*)?)#(\d{3,4})([^_]|$)?" -rep_pat = r"\1GH\2_\3" -anchor_pat = ".. _GH{id}: https://github.com/pandas-dev/pandas/issues/{id}" -section_pat = "^pandas\s[\d\.]+\s*$" - - -def main(): - issues = OrderedDict() - while True: - - line = sys.stdin.readline() - if not line: - break - - if re.search(section_pat, line): - for id in issues: - print(anchor_pat.format(id=id).rstrip()) - if issues: - print("\n") - issues = OrderedDict() - - for m in re.finditer(pat, line): - id = m.group(2) - if id not in issues: - issues[id] = True - print(re.sub(pat, rep_pat, line).rstrip()) - pass - -if __name__ == "__main__": - main() diff --git a/scripts/use_build_cache.py b/scripts/use_build_cache.py deleted file mode 100755 index f8c2df2a8a45d..0000000000000 --- a/scripts/use_build_cache.py +++ /dev/null @@ -1,354 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -import os - -""" -This script should be run from the repo root dir, it rewrites setup.py -to use the build cache directory specified in the envar BUILD_CACHE_DIR -or in a file named .build_cache_dir in the repo root directory. - -Artifacts included in the cache: -- gcc artifacts -- The .c files resulting from cythonizing pyx/d files -- 2to3 refactoring results (when run under python3) - -Tested on releases back to 0.7.0. - -""" - -try: - import argparse - argparser = argparse.ArgumentParser(description=""" - 'Program description. 
- """.strip()) - - argparser.add_argument('-f', '--force-overwrite', - default=False, - help='Setting this will overwrite any existing cache results for the current commit', - action='store_true') - argparser.add_argument('-d', '--debug', - default=False, - help='Report cache hits/misses', - action='store_true') - - args = argparser.parse_args() -except: - class Foo(object): - debug=False - force_overwrite=False - - args = Foo() # for 2.6, no argparse - -#print(args.accumulate(args.integers)) - -shim=""" -import os -import sys -import shutil -import warnings -import re -""" - -shim += ("BC_FORCE_OVERWRITE = %s\n" % args.force_overwrite) -shim += ("BC_DEBUG = %s\n" % args.debug) - -shim += """ -try: - if not ("develop" in sys.argv) and not ("install" in sys.argv): - 1/0 - basedir = os.path.dirname(__file__) - dotfile = os.path.join(basedir,".build_cache_dir") - BUILD_CACHE_DIR = "" - if os.path.exists(dotfile): - BUILD_CACHE_DIR = open(dotfile).readline().strip() - BUILD_CACHE_DIR = os.environ.get('BUILD_CACHE_DIR',BUILD_CACHE_DIR) - - if os.path.isdir(BUILD_CACHE_DIR): - print("--------------------------------------------------------") - print("BUILD CACHE ACTIVATED (V2). be careful, this is experimental.") - print("BUILD_CACHE_DIR: " + BUILD_CACHE_DIR ) - print("--------------------------------------------------------") - else: - BUILD_CACHE_DIR = None - - # retrieve 2to3 artifacts - if sys.version_info[0] >= 3: - from lib2to3 import refactor - from hashlib import sha1 - import shutil - import multiprocessing - pyver = "%d.%d" % (sys.version_info[:2]) - fileq = ["pandas"] - to_process = dict() - - # retrieve the hashes existing in the cache - orig_hashes=dict() - post_hashes=dict() - for path,dirs,files in os.walk(os.path.join(BUILD_CACHE_DIR,'pandas')): - for f in files: - s=f.split(".py-")[-1] - try: - prev_h,post_h,ver = s.split('-') - if ver == pyver: - orig_hashes[prev_h] = os.path.join(path,f) - post_hashes[post_h] = os.path.join(path,f) - except: - pass - - while fileq: - f = fileq.pop() - - if os.path.isdir(f): - fileq.extend([os.path.join(f,x) for x in os.listdir(f)]) - else: - if not f.endswith(".py"): - continue - else: - try: - h = sha1(open(f,"rb").read()).hexdigest() - except IOError: - to_process[h] = f - else: - if h in orig_hashes and not BC_FORCE_OVERWRITE: - src = orig_hashes[h] - if BC_DEBUG: - print("2to3 cache hit %s,%s" % (f,h)) - shutil.copyfile(src,f) - elif h not in post_hashes: - # we're not in a dev dir with already processed files - if BC_DEBUG: - print("2to3 cache miss (will process) %s,%s" % (f,h)) - to_process[h] = f - - avail_fixes = set(refactor.get_fixers_from_package("lib2to3.fixes")) - avail_fixes.discard('lib2to3.fixes.fix_next') - t=refactor.RefactoringTool(avail_fixes) - if to_process: - print("Starting 2to3 refactoring...") - for orig_h,f in to_process.items(): - if BC_DEBUG: - print("2to3 on %s" % f) - try: - t.refactor([f],True) - post_h = sha1(open(f, "rb").read()).hexdigest() - cached_fname = f + '-' + orig_h + '-' + post_h + '-' + pyver - path = os.path.join(BUILD_CACHE_DIR, cached_fname) - pathdir =os.path.dirname(path) - if BC_DEBUG: - print("cache put %s in %s" % (f, path)) - try: - os.makedirs(pathdir) - except OSError as exc: - import errno - if exc.errno == errno.EEXIST and os.path.isdir(pathdir): - pass - else: - raise - - shutil.copyfile(f, path) - - except Exception as e: - print("While processing %s 2to3 raised: %s" % (f,str(e))) - - pass - print("2to3 done refactoring.") - -except Exception as e: - if not 
isinstance(e,ZeroDivisionError): - print( "Exception: " + str(e)) - BUILD_CACHE_DIR = None - -class CompilationCacheMixin(object): - def __init__(self, *args, **kwds): - cache_dir = kwds.pop("cache_dir", BUILD_CACHE_DIR) - self.cache_dir = cache_dir - if not os.path.isdir(cache_dir): - raise Exception("Error: path to Cache directory (%s) is not a dir" % cache_dir) - - def _copy_from_cache(self, hash, target): - src = os.path.join(self.cache_dir, hash) - if os.path.exists(src) and not BC_FORCE_OVERWRITE: - if BC_DEBUG: - print("Cache HIT: asked to copy file %s in %s" % - (src,os.path.abspath(target))) - s = "." - for d in target.split(os.path.sep)[:-1]: - s = os.path.join(s, d) - if not os.path.exists(s): - os.mkdir(s) - shutil.copyfile(src, target) - - return True - - return False - - def _put_to_cache(self, hash, src): - target = os.path.join(self.cache_dir, hash) - if BC_DEBUG: - print( "Cache miss: asked to copy file from %s to %s" % (src,target)) - s = "." - for d in target.split(os.path.sep)[:-1]: - s = os.path.join(s, d) - if not os.path.exists(s): - os.mkdir(s) - shutil.copyfile(src, target) - - def _hash_obj(self, obj): - try: - return hash(obj) - except: - raise NotImplementedError("You must override this method") - -class CompilationCacheExtMixin(CompilationCacheMixin): - def _hash_file(self, fname): - from hashlib import sha1 - f= None - try: - hash = sha1() - hash.update(self.build_lib.encode('utf-8')) - try: - if sys.version_info[0] >= 3: - import io - f = io.open(fname, "rb") - else: - f = open(fname) - - first_line = f.readline() - # ignore cython generation timestamp header - if "Generated by Cython" not in first_line.decode('utf-8'): - hash.update(first_line) - hash.update(f.read()) - return hash.hexdigest() - - except: - raise - return None - finally: - if f: - f.close() - - except IOError: - return None - - def _hash_obj(self, ext): - from hashlib import sha1 - - sources = ext.sources - if (sources is None or - (not hasattr(sources, '__iter__')) or - isinstance(sources, str) or - sys.version[0] == 2 and isinstance(sources, unicode)): # argh - return False - - sources = list(sources) + ext.depends - hash = sha1() - try: - for fname in sources: - fhash = self._hash_file(fname) - if fhash: - hash.update(fhash.encode('utf-8')) - except: - return None - - return hash.hexdigest() - - -class CachingBuildExt(build_ext, CompilationCacheExtMixin): - def __init__(self, *args, **kwds): - CompilationCacheExtMixin.__init__(self, *args, **kwds) - kwds.pop("cache_dir", None) - build_ext.__init__(self, *args, **kwds) - - def build_extension(self, ext, *args, **kwds): - ext_path = self.get_ext_fullpath(ext.name) - build_path = os.path.join(self.build_lib, os.path.basename(ext_path)) - - hash = self._hash_obj(ext) - if hash and self._copy_from_cache(hash, ext_path): - return - - build_ext.build_extension(self, ext, *args, **kwds) - - hash = self._hash_obj(ext) - if os.path.exists(build_path): - self._put_to_cache(hash, build_path) # build_ext - if os.path.exists(ext_path): - self._put_to_cache(hash, ext_path) # develop - - def cython_sources(self, sources, extension): - import re - cplus = self.cython_cplus or getattr(extension, 'cython_cplus', 0) or \ - (extension.language and extension.language.lower() == 'c++') - target_ext = '.c' - if cplus: - target_ext = '.cpp' - - for i, s in enumerate(sources): - if not re.search("\.(pyx|pxi|pxd)$", s): - continue - ext_dir = os.path.dirname(s) - ext_basename = re.sub("\.[^\.]+$", "", os.path.basename(s)) - ext_basename += target_ext - target = 
os.path.join(ext_dir, ext_basename) - hash = self._hash_file(s) - sources[i] = target - if hash and self._copy_from_cache(hash, target): - continue - build_ext.cython_sources(self, [s], extension) - self._put_to_cache(hash, target) - - sources = [x for x in sources if x.startswith("pandas") or "lib." in x] - - return sources - -if BUILD_CACHE_DIR: # use the cache - cmdclass['build_ext'] = CachingBuildExt - -try: - # recent - setuptools_kwargs['use_2to3'] = True if BUILD_CACHE_DIR is None else False -except: - pass - -try: - # pre eb2234231 , ~ 0.7.0, - setuptools_args['use_2to3'] = True if BUILD_CACHE_DIR is None else False -except: - pass - -""" -def main(): - opd = os.path.dirname - opj = os.path.join - s= None - with open(opj(opd(__file__),"..","setup.py")) as f: - s = f.read() - if s: - if "BUILD CACHE ACTIVATED (V2)" in s: - print( "setup.py already wired with V2 build_cache, skipping..") - else: - SEP="\nsetup(" - before,after = s.split(SEP) - with open(opj(opd(__file__),"..","setup.py"),"wb") as f: - f.write((before + shim + SEP + after).encode('ascii')) - print(""" - setup.py was rewritten to use a build cache. - Make sure you've put the following in your .bashrc: - - export BUILD_CACHE_DIR= - echo $BUILD_CACHE_DIR > pandas_repo_rootdir/.build_cache_dir - - Once active, build results (compilation, cythonizations and 2to3 artifacts) - will be cached in "$BUILD_CACHE_DIR" and subsequent builds should be - sped up if no changes requiring recompilation were made. - - Go ahead and run: - - python setup.py clean - python setup.py develop - - """) - -if __name__ == '__main__': - import sys - sys.exit(main()) diff --git a/scripts/winbuild_py27.bat b/scripts/winbuild_py27.bat deleted file mode 100644 index bec67c7e527ed..0000000000000 --- a/scripts/winbuild_py27.bat +++ /dev/null @@ -1,2 +0,0 @@ -SET PATH=C:\MinGW\bin;C:\Python27;C:\Python27\Scripts;%PATH% -python setup.py build -c mingw32 bdist_wininst
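
For anyone who still wants a rough sense of what the deleted join benchmarks measured, a minimal public-API sketch follows. It mirrors the setup of the removed scripts/bench_join.py (two int64 indexes overlapping by 1 - pct_overlap, with left/outer/inner joins timed and averaged over several iterations, reported in milliseconds per join), but it is illustrative only: the function name bench_join, the use of time.perf_counter, and the default sizes are assumptions rather than a restored file, and it deliberately avoids the private pandas._libs join routines and the removed compat helpers the original relied on.

# Hypothetical re-creation of the deleted bench_join.py idea using only the
# public pandas API; names and defaults here are illustrative assumptions.
import time

import numpy as np
import pandas as pd


def bench_join(n=100_000, pct_overlap=0.2, k=1, iterations=20):
    # Two int64 indexes whose label ranges overlap by (1 - pct_overlap) * n,
    # matching the setup used by the removed script.
    a = np.arange(n, dtype=np.int64)
    b = a + int(n * pct_overlap)

    left = pd.DataFrame(np.random.randn(n, k), index=a, columns=range(k))
    right = pd.DataFrame(np.random.randn(n, k), index=b, columns=range(k, 2 * k))

    results = {}
    for how in ("outer", "left", "inner"):
        start = time.perf_counter()
        for _ in range(iterations):
            left.join(right, how=how)
        # Report mean milliseconds per join, as the old script did.
        results[how] = (time.perf_counter() - start) / iterations * 1000
    return pd.Series(results)


if __name__ == "__main__":
    print(bench_join())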