-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWorkingWithData.py
109 lines (92 loc) · 3.52 KB
/
WorkingWithData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import datetime
import math,random
from turtle import width
from typing import List,Dict
from collections import Counter, namedtuple
import matplotlib.pyplot as plt
import probabilty, vectors,statistics
from dateutil.parser import parse
import csv,re,tqdm
# Takes a point and assigns it to a bucket, so that a histogram can be built from the buckets at a later point.
def bucketize(point: float, bucketSize: float) -> float:
    """Return the lower edge of the bucket that ``point`` falls into.

    Buckets are ``bucketSize`` wide and begin at multiples of ``bucketSize``.
    Example: with a bucket size of 10, the point 53 gives floor(5.3) = 5
    whole buckets, so it is placed in the bucket labelled 50.
    """
    # Number of whole buckets at or below the point (rounded down).
    whole_buckets = math.floor(point / bucketSize)
    return bucketSize * whole_buckets
def createHistogramDict(dataPoints: List[float], bucketSize: float) -> Dict[float, int]:
    """Count how many of ``dataPoints`` land in each bucket.

    Each point is mapped to its bucket with ``bucketize``; the result maps
    every bucket value that occurred to the number of points placed in it.
    """
    bucket_counts = Counter()
    for value in dataPoints:
        bucket_counts[bucketize(value, bucketSize)] += 1
    return bucket_counts
def plotHistogram(points: List[float], bucketSize: float, title: str = ""):
    """Bucket ``points`` and display the bucket counts as a bar chart.

    Args:
        points: the values to count.
        bucketSize: width of each bucket; also used as the width of each bar.
        title: text placed above the chart (empty by default).

    The chart is shown with ``plt.show()``; nothing is returned.
    """
    histogramVals = createHistogramDict(points, bucketSize)
    # The counts are keyed by each bucket's lower edge
    # (bucketSize * floor(point / bucketSize)), so the bars must start at
    # that x value. plt.bar centres bars on x by default, which would draw
    # every bar half a bucket too far to the left.
    plt.bar(
        histogramVals.keys(),
        histogramVals.values(),
        width=bucketSize,
        align="edge",
    )
    plt.title(title)
    plt.show()
# 5000 values, each one a random.random() draw scaled by 200.
uniform = [200 * random.random() for _ in range(5000)]
# Example usage: plotHistogram(points=uniform, bucketSize=10, title="test")
# Returns a random point from a normal distribution with mean 0 and standard deviation 1.
def randomNormalSample() -> float:
    """Draw one random value by inverting the normal CDF.

    A ``random.random()`` draw is passed to
    ``probabilty.inverse_normal_cdf`` and that function's result is returned.
    """
    uniform_draw = random.random()
    return probabilty.inverse_normal_cdf(uniform_draw)
# Two separate batches of 5000 draws each from randomNormalSample.
normalSample = [randomNormalSample() for _ in range(5000)]
normalSample2 = [randomNormalSample() for _ in range(5000)]
# Each batch becomes one row of the matrix.
matrix = [normalSample, normalSample2]
# Example usage: plotHistogram(points=normalSample, bucketSize=0.1, title="normal Test")
# To look at multi-dimensional data we can build a correlation matrix, whose (i, j)th entry is the correlation between data set i and data set j.
# Each row of the input is itself a list of data, so we take the correlation between every pair of data sets.
def correleationMatrix(data: List[List]):
    """Build the matrix of pairwise correlations between the rows of ``data``.

    The entry for position (i, j) is
    ``statistics.correlation(data[i], data[j])``. The matrix itself is built
    by ``vectors.make_matrix``, which is given ``len(data)`` for both
    dimensions together with the entry function.
    """
    size = len(data)

    def pairwise_correlation(i: int, j: int) -> float:
        # Correlation between the i-th and j-th data sets.
        return statistics.correlation(data[i], data[j])

    return vectors.make_matrix(size, size, pairwise_correlation)
# Example usage: print(correleationMatrix(matrix))

# Two parallel lists of 20 sample values; heights[k] and weights[k] belong together.
heights = [
    168, 175, 160, 182, 155, 190, 165, 178, 172, 159,
    185, 173, 161, 188, 166, 177, 158, 170, 181, 176,
]
weights = [
    65, 70, 58, 80, 52, 90, 62, 75, 68, 56,
    78, 70, 60, 85, 63, 72, 55, 67, 79, 74,
]
# Returns the means and the standard deviations (in that order) for each position.
def scale(data: List[List[float]]):
    """Return ``(means, stdevs)`` computed position by position over ``data``.

    ``means`` is whatever ``vectors.vectorMeans(data)`` returns. ``stdevs``
    holds one ``statistics.stdev`` value per position, computed over the
    values that all vectors have at that position. The number of positions
    is taken from the first vector.
    """
    dim = len(data[0])
    means = vectors.vectorMeans(data)
    stdevs = []
    for position in range(dim):
        # Gather the value every vector holds at this position.
        column = [row[position] for row in data]
        stdevs.append(statistics.stdev(column))
    return means, stdevs
# 20 two-element rows, laid out five rows per line.
data = [
    [168, 65], [175, 70], [160, 58], [182, 80], [155, 52],
    [190, 90], [165, 62], [178, 75], [172, 68], [159, 56],
    [185, 78], [173, 70], [161, 60], [188, 85], [166, 63],
    [177, 72], [158, 55], [170, 67], [181, 79], [176, 74],
]
def rescale(data: List[List[float]]) -> List[List[float]]:
    """
    Rescales the input data so that each position has
    mean 0 and standard deviation 1. (Leaves a position
    as is if its standard deviation is 0.)

    Args:
        data: list of equal-length vectors; it is not modified.

    Returns:
        A new list of new vectors holding the rescaled values. An empty
        input gives an empty list.
    """
    # Nothing to rescale; also avoids indexing data[0] on an empty list.
    if not data:
        return []

    dim = len(data[0])
    means, stdevs = scale(data)

    # Work on a copy of each vector so the caller's data stays untouched.
    rescaled = [v[:] for v in data]

    for v in rescaled:
        for i in range(dim):
            # A zero standard deviation would mean dividing by zero, so
            # such a position keeps its original values.
            if stdevs[i] > 0:
                # Subtract the mean and divide by the stdev of this position.
                v[i] = (v[i] - means[i]) / stdevs[i]

    return rescaled
# Rebind data to the result of rescale; the raw rows are no longer reachable
# under this name afterwards.
data=(rescale(data))
#print(vectors.vectorMeans(data))
# The range is wrapped in tqdm.tqdm, presumably to show a progress bar while
# the loop runs (confirm against the tqdm documentation).
for i in tqdm.tqdm(range(10)):
    # Each pass builds a list of 10,000,000 random floats; the list is only
    # bound to `test` and never read in the visible code.
    test=[random.random()for i in range(10000000)]
# NOTE(review): indentation was lost in this copy of the file; this print is
# read as running once after the loop, not once per pass. Confirm.
print("done")