-
Notifications
You must be signed in to change notification settings - Fork 0
/
bigmart (2).R
102 lines (60 loc) · 1.86 KB
/
bigmart (2).R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
######## Environment Setup ########
# rmr2 and rhdfs locate Hadoop through the HADOOP_CMD and HADOOP_STREAMING
# environment variables, so both are set before the packages are loaded.
# HADOOP_CMD names the hadoop executable itself, hence no trailing slash.
Sys.setenv(HADOOP_CMD = "/usr/bin/hadoop")
Sys.setenv(
  HADOOP_STREAMING =
    "/usr/lib/hadoop-mapreduce/hadoopstreaming/hadoop-streaming.jar"
)
library(rmr2)
# Select rmr2's "local" backend for the map-reduce jobs in this script.
rmr.options(backend = "local")
library(rhdfs)
# Initialise rhdfs before any hdfs.* call is made.
hdfs.init()
######## Sampling Data ##############
# We take a 10% sample of the data to perform the operations.
# dplyr supplies glimpse(), select(), sample_frac() and the %>% pipe used
# below; it has to be attached here because nothing else in the script loads it.
library(dplyr)
bigmartnew <- read.csv("bigmartnew.csv")
glimpse(bigmartnew)
# Keep only the predictor and the response, then draw a random 10% of rows.
mydf <- bigmartnew %>%
  select("Item_Visibility", "Profit_Margin") %>%
  sample_frac(0.1)
# Note: write.csv() also writes the row names as an unnamed first column.
write.csv(mydf, file = "/home/cloudera/Desktop/mydf.csv")
######### Basic Hadoop Operation ######
# Put the sampled CSV from the local file system into HDFS.
hdfs.put("/home/cloudera/Desktop/mydf.csv", "/wqd7009/")
# List the target HDFS directory.
hdfs.ls("/wqd7009/")
# Change the mode of the uploaded file. The argument is spelled out in full
# (`permissions`) instead of relying on partial argument matching.
hdfs.chmod("/wqd7009/mydf.csv", permissions = "777")
######### Data Modeling #######
# Run linear regression with lm() and time it.
a.time <- proc.time()
linearModel <- lm(Profit_Margin ~ Item_Visibility, data = mydf)
# Capture the elapsed time right after the fit so that printing and plotting
# (which may wait for user input between pages) do not inflate the measurement.
a.elapsed <- proc.time() - a.time
summary(linearModel)
linearModel
# mean(fitted.values(linearModel)) == mean(mydf$Profit_Margin)
# mean(residuals(linearModel))
# abline() adds the fitted line to the *current* plot, so draw the data scatter
# first; calling it after plot(linearModel) would overlay the line on the last
# diagnostic plot, whose axes are unrelated to the model variables.
plot(Profit_Margin ~ Item_Visibility, data = mydf)
abline(linearModel)
# Standard lm diagnostic plots.
plot(linearModel)
# Time taken by the lm() fit.
a.elapsed
# Using map reduce + timing it!
b.time <- proc.time()
# Design matrix: predictor in column 1, a column of ones (intercept) in column 2.
X <- cbind(mydf$Item_Visibility, rep(1, nrow(mydf)))
# Prepend a row-index column so each map call can look up its matching rows of
# y. seq_len() is used instead of 1:nrow(X), which would yield c(1, 0) when
# there are no rows.
X.index <- to.dfs(cbind(seq_len(nrow(X)), X))
# Response as a one-column matrix; the Xty map function indexes it by row.
y <- as.matrix(mydf$Profit_Margin)
# Shared reduce/combine step: add up all values received for a key and emit
# the total, wrapped in a list, under the single key 1.
#   .  : key argument (ignored)
#   YY : list of values to be summed element-wise with `+`
Reducer <- function(., YY) {
  total <- Reduce(`+`, YY)
  keyval(1, list(total))
}
# X'X via map-reduce: each map call emits the cross-product of its chunk of
# rows under key 1, and Reducer (also used as combiner) sums the partial
# matrices. The first value of the job output is the total.
XtX <-
  values(
    from.dfs(
      mapreduce(
        input = X.index,
        map = function(., Xi) {
          # Remove the row-index column. drop = FALSE keeps a one-row chunk as
          # a matrix; otherwise it collapses to a vector and the product
          # below would no longer be a 2 x 2 matrix.
          Xi <- Xi[, -1, drop = FALSE]
          # crossprod(Xi) computes t(Xi) %*% Xi.
          keyval(1, list(crossprod(Xi)))
        },
        reduce = Reducer,
        combine = TRUE
      )
    )
  )[[1]]
# X'y via map-reduce: each map call pairs its chunk of X with the matching
# rows of the global y (looked up through the row-index column) and emits the
# partial product under key 1; Reducer (also used as combiner) sums them.
Xty <-
  values(
    from.dfs(
      mapreduce(
        input = X.index,
        map = function(., Xi) {
          # Column 1 of the chunk holds the original row numbers.
          # drop = FALSE keeps one-row chunks as matrices so the product
          # below stays conformable.
          yi <- y[Xi[, 1], , drop = FALSE]
          Xi <- Xi[, -1, drop = FALSE]
          # crossprod(Xi, yi) computes t(Xi) %*% yi.
          keyval(1, list(crossprod(Xi, yi)))
        },
        reduce = Reducer,
        combine = TRUE
      )
    )
  )[[1]]
# Solve the normal equations (X'X) b = X'y for the coefficients. X has the
# predictor in its first column and the ones column second, so the first
# entry of the result belongs to Item_Visibility and the second to the
# intercept.
solve(a = XtX, b = Xty)
# Elapsed time since b.time was recorded at the start of the map-reduce section.
proc.time() - b.time