The following data is daily bike rentals joined with weather data. The goal of my analysis is to predict the number of unregistered users.
day.original = read.csv('day.csv')
day = day.original
#Fixing these values, they were illogical considering the other values for the days immediately surrounding them.
#For the humidity, I found days that were similar in temperature and time of year, and also used values from the surrounding days.
#For the atemp values, I calculated a temp/atemp ratio from the previous day, and the following day, and used that to ratio to set the atemp value.
day$hum[69] = mean(day$hum[c(26, 90, 106, 407, 478, 68, 68, 70, 70)])
day$atemp[595] = day$temp[595] * mean(c(day$atemp[594]/day$temp[594], day$atemp[596]/day$temp[596]))
Feature creation and data manipulation:
#factor which is basically an interaction term between mnth and workingday. This separates out the weekend from the weekday for each month, creating a total of 24 levels, with each level being of the form Jan-Weekday, Jan-Weekend, Feb-Weekday, etc.
day$mnthWeekend = as.factor(day$mnth * 2 + day$workingday)
day$season = as.factor(day$season)
day$mnth = as.factor(day$mnth)
day$weekday = as.factor(day$weekday)
day$hum = abs(mean(day$hum) - day$hum)
day$weekend = 0
day$weekend[which(day$weekday == 5)] = 1
day$weekend[which(day$weekday == 6)] = 2
day$weekend[which(day$weekday == 0)] = 3
day$weekend = as.factor(day$weekend)
day$workingday = as.factor(day$workingday)
#Creating Separate Holiday factors. This helps for some of the models I will use, while the other models will fail with it on.
#day$holiday[which((day$mnth == 4) & (day$holiday == 1))] = 4
#day$holiday[which((day$mnth == 7) & (day$holiday == 1))] = 7
#day$holiday[which((day$mnth ==10) & (day$holiday == 1))] = 10
#day$holiday[which((day$mnth == 11) & (day$holiday == 1))] = 11
#day$holiday = as.factor(day$holiday)
day$holidayseasons = 0
day$holidayseasons[which((day$mnth == 4) & (day$holiday == 1))] = 4
day$holidayseasons[which((day$mnth == 7) & (day$holiday == 1))] = 7
day$holidayseasons[which((day$mnth ==10) & (day$holiday == 1))] = 10
day$holidayseasons[which((day$mnth == 11) & (day$holiday == 1))] = 11
day$holidayseasons = as.factor(day$holidayseasons)
# previous days and weeks bike rentals, both casual bike rentals and the full count of caual + registered members
day$previousCasual = 0
day$previousCasual[2:731] = day$casual[1:730]
day$previousweekCasual = 0
day$previousweekCasual[8:731] = day$casual[1:724]
day$previousAll = 0
day$previousAll[2:731] = day$cnt[1:730]
day$previousweekAll = 0
day$previousweekAll[8:731] = day$cnt[1:724]
day$weathersit = as.factor(day$weathersit)
# This week factor is mostly useful for plotting
day$week = 0
i = 0
for(x in 2:360)
{
if (day$weekday[x] == 0) {
i = i +1
day$week[x] = i
day$week[x+371] = i }
else
{
day$week[x] = i
day$week[x+371] = i }
}
day$week[361:365] = 52
# weeks which have substantially lower bike rentals
day$slowseason = 0
day$slowseason[which(day$week >=0 & day$week < 10)] = 1
day$slowseason[which(day$week >48)] = 1
day$slowseason = as.factor(day$slowseason)
#day$week = as.factor(day$week)
#ratio of yesterdays temperature to todays
day$temprat = 0
day$temprat[8:731] = day$atemp[8:731] / day$atemp[1:724]
#Temperature ratio as distance from 1
day$tempratAbs = abs(day$temprat -1)
#Average users per month on a weekend vs weekday basis
day$mnthAvgCasual = 0
for(x in 2:25)
{
day$mnthAvgCasual[which(day$mnthWeekend==x)] = mean(day$casual[which(day$mnthWeekend == x )])
}
day$mnthAvgCasualF = as.factor(day$mnthAvgCasual)
day$mnthAvgAll = 0
for(x in 2:25)
{
day$mnthAvgAll[which(day$mnthWeekend==x)] = mean(day$cnt[which(day$mnthWeekend == x )])
}
#The absolute value of Todays temperature divided by the average temperature for the month, minus 1.
day$mnthTempAvg = 0
for(x in 1:12)
{
day$mnthTempAvg[which(day$mnth==x)] = mean(day$atemp[which(day$mnth == x)])
}
day$mnthTempAvg = abs(1- day$atemp/ day$mnthTempAvg)
day$mnthHumAvg = 0
for(x in 1:12)
{
day$mnthHumAvg[which(day$mnth==x)] = mean(day$hum[which(day$mnth == x)])
}
day$mnthHumAvg = abs(1- day$hum/ day$mnthHumAvg)
day$seasonTempAvg = 0
for(x in 1:4)
{
day$seasonTempAvg[which(day$season ==x)] = mean(day$atemp[which(day$season == x)])
}
day$seasonWeekend = as.factor(as.numeric(day$season) * 2 + as.numeric(day$workingday))
day$seasonAvgCasual = 0
for(x in 3:12)
{
day$seasonAvgCasual[which(day$seasonWeekend==x)] = mean(day$casual[which(day$seasonWeekend == x )])
}
day$seasonAvgCasual = as.factor(day$seasonAvgCasual)
library(viridis)
## Loading required package: viridisLite
cols.heat = viridis(3410, begin=.1, end=.9, direction = -1)
day.cols = rep(0, times = 731)
for(x in 1: 731)
{
day.cols[x] = cols.heat[day$casual[x]]
}
#backup
daycopy = day
library(boot)
library(splines)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:boot':
##
## logit
cv.error=0
excluded=-which(day$casual<20)
formula.weather = (casual ~ yr:weekday+ yr:mnthAvgCasual:slowseason +season:weekday:windspeed + mnth + mnthWeekend:temp + previousCasual:weathersit + previousweekCasual:weathersit:hum + holiday:holidayseasons + seasonWeekend:tempratAbs:hum:windspeed)
#(casual ~ yr:weekday + yr:mnthAvgCasual:slowseason + season:weekday + mnth + mnth:weekend:atemp + hum:weekend + windspeed:weekend + previousCasual + previousweekCasual:weathersit:hum + holiday + season:temprat + previousAll:previousCasual)
#(casual ~ yr:seasonWeekend + mnthAvgCasual:weathersit + season+ season:weekend:atemp + hum:windspeed:weekend + weathersit:previousCasual + holiday)
glm.1 = glm(formula.weather, data=day)
summary(glm.1)
##
## Call:
## glm(formula = formula.weather, data = day)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -937.10 -104.82 -6.33 103.54 1043.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.292e+02 9.082e+01 1.423 0.155311
## mnth2 -1.386e+02 1.466e+02 -0.945 0.344841
## mnth3 -5.076e+02 1.446e+02 -3.511 0.000478
## mnth4 -1.519e+02 1.914e+02 -0.793 0.427908
## mnth5 3.545e+02 2.578e+02 1.375 0.169689
## mnth6 1.565e+03 3.350e+02 4.672 3.64e-06
## mnth7 4.709e+03 4.540e+02 10.372 < 2e-16
## mnth8 7.696e+02 5.151e+02 1.494 0.135620
## mnth9 1.044e+03 3.117e+02 3.348 0.000861
## mnth10 -2.526e+02 2.106e+02 -1.199 0.230871
## mnth11 -4.505e+02 2.042e+02 -2.206 0.027742
## mnth12 -7.012e+01 1.716e+02 -0.409 0.683020
## yr:weekday0 2.638e+02 8.138e+01 3.241 0.001252
## yr:weekday1 9.444e+01 5.602e+01 1.686 0.092294
## yr:weekday2 1.334e+02 5.352e+01 2.493 0.012907
## yr:weekday3 1.267e+02 5.126e+01 2.471 0.013720
## yr:weekday4 1.738e+02 5.149e+01 3.376 0.000781
## yr:weekday5 3.137e+02 5.236e+01 5.992 3.46e-09
## yr:weekday6 5.154e+02 8.028e+01 6.421 2.64e-10
## mnthWeekend2:temp 1.693e+03 3.938e+02 4.298 1.99e-05
## mnthWeekend3:temp 1.411e+02 3.779e+02 0.373 0.708902
## mnthWeekend4:temp 2.724e+03 4.704e+02 5.790 1.10e-08
## mnthWeekend5:temp 7.114e+02 3.757e+02 1.894 0.058738
## mnthWeekend6:temp 4.991e+03 3.548e+02 14.068 < 2e-16
## mnthWeekend7:temp 2.273e+03 2.886e+02 7.875 1.47e-14
## mnthWeekend8:temp 3.684e+03 4.094e+02 8.999 < 2e-16
## mnthWeekend9:temp 1.328e+03 3.517e+02 3.775 0.000175
## mnthWeekend10:temp 2.313e+03 4.195e+02 5.515 5.08e-08
## mnthWeekend11:temp 4.149e+02 4.053e+02 1.024 0.306398
## mnthWeekend12:temp -2.643e+01 4.754e+02 -0.056 0.955676
## mnthWeekend13:temp -1.390e+03 4.627e+02 -3.004 0.002771
## mnthWeekend14:temp -4.174e+03 5.823e+02 -7.169 2.10e-12
## mnthWeekend15:temp -5.325e+03 5.825e+02 -9.142 < 2e-16
## mnthWeekend16:temp 1.095e+03 7.392e+02 1.481 0.139004
## mnthWeekend17:temp -2.541e+01 7.187e+02 -0.035 0.971808
## mnthWeekend18:temp 1.266e+03 4.832e+02 2.620 0.009007
## mnthWeekend19:temp -7.228e+02 4.884e+02 -1.480 0.139442
## mnthWeekend20:temp 4.364e+03 4.455e+02 9.796 < 2e-16
## mnthWeekend21:temp 1.549e+03 3.739e+02 4.142 3.90e-05
## mnthWeekend22:temp 4.185e+03 5.193e+02 8.059 3.80e-15
## mnthWeekend23:temp 1.849e+03 4.836e+02 3.823 0.000145
## mnthWeekend24:temp 2.280e+03 5.153e+02 4.424 1.14e-05
## mnthWeekend25:temp 7.846e+02 4.490e+02 1.747 0.081052
## previousCasual:weathersit1 6.913e-02 2.583e-02 2.676 0.007641
## previousCasual:weathersit2 5.524e-02 2.818e-02 1.960 0.050404
## previousCasual:weathersit3 -1.077e-01 1.405e-01 -0.767 0.443587
## holiday:holidayseasons0 -3.484e+02 9.445e+01 -3.689 0.000245
## holiday:holidayseasons4 -1.285e+03 2.058e+02 -6.244 7.80e-10
## holiday:holidayseasons7 9.987e+02 1.794e+02 5.566 3.84e-08
## holiday:holidayseasons10 -9.049e+02 1.838e+02 -4.922 1.09e-06
## holiday:holidayseasons11 -6.661e+02 1.524e+02 -4.370 1.45e-05
## yr:mnthAvgCasual:slowseason0 1.022e-01 4.345e-02 2.353 0.018944
## yr:mnthAvgCasual:slowseason1 -3.700e-01 1.215e-01 -3.046 0.002412
## weekday0:season1:windspeed -1.895e+03 3.107e+02 -6.098 1.86e-09
## weekday1:season1:windspeed -3.400e+02 2.850e+02 -1.193 0.233294
## weekday2:season1:windspeed -4.334e+02 2.918e+02 -1.485 0.137934
## weekday3:season1:windspeed -3.996e+02 2.574e+02 -1.552 0.121114
## weekday4:season1:windspeed -6.439e+02 2.742e+02 -2.348 0.019157
## weekday5:season1:windspeed -5.127e+02 2.475e+02 -2.071 0.038725
## weekday6:season1:windspeed -2.440e+03 3.344e+02 -7.298 8.70e-13
## weekday0:season2:windspeed -1.037e+01 4.696e+02 -0.022 0.982388
## weekday1:season2:windspeed -3.896e+02 2.866e+02 -1.359 0.174486
## weekday2:season2:windspeed -7.255e+02 2.733e+02 -2.655 0.008134
## weekday3:season2:windspeed -6.648e+02 2.926e+02 -2.273 0.023383
## weekday4:season2:windspeed -2.631e+02 2.770e+02 -0.950 0.342525
## weekday5:season2:windspeed 4.346e+02 3.209e+02 1.354 0.176079
## weekday6:season2:windspeed -1.856e+02 4.495e+02 -0.413 0.679838
## weekday0:season3:windspeed -1.050e+03 4.550e+02 -2.307 0.021347
## weekday1:season3:windspeed -9.160e+02 3.748e+02 -2.444 0.014789
## weekday2:season3:windspeed -7.341e+02 3.649e+02 -2.012 0.044655
## weekday3:season3:windspeed -9.211e+02 4.016e+02 -2.293 0.022144
## weekday4:season3:windspeed -7.099e+02 3.592e+02 -1.976 0.048548
## weekday5:season3:windspeed -1.250e+02 3.952e+02 -0.316 0.751927
## weekday6:season3:windspeed -6.032e+02 4.187e+02 -1.441 0.150180
## weekday0:season4:windspeed -2.850e+03 4.113e+02 -6.930 1.03e-11
## weekday1:season4:windspeed -5.202e+02 3.746e+02 -1.389 0.165415
## weekday2:season4:windspeed -6.115e+02 3.390e+02 -1.804 0.071772
## weekday3:season4:windspeed -6.525e+02 3.236e+02 -2.016 0.044167
## weekday4:season4:windspeed -5.526e+02 2.865e+02 -1.929 0.054204
## weekday5:season4:windspeed 9.455e+01 3.269e+02 0.289 0.772487
## weekday6:season4:windspeed -1.273e+03 4.149e+02 -3.068 0.002245
## weathersit1:previousweekCasual:hum 4.550e-01 1.259e-01 3.614 0.000325
## weathersit2:previousweekCasual:hum -1.498e+00 1.611e-01 -9.298 < 2e-16
## weathersit3:previousweekCasual:hum -2.067e+00 5.096e-01 -4.056 5.60e-05
## windspeed:hum:seasonWeekend3:tempratAbs 4.801e+03 1.610e+03 2.981 0.002982
## windspeed:hum:seasonWeekend4:tempratAbs 7.881e+02 1.849e+03 0.426 0.670152
## windspeed:hum:seasonWeekend5:tempratAbs -1.667e+04 6.898e+03 -2.416 0.015965
## windspeed:hum:seasonWeekend6:tempratAbs -7.259e+02 2.425e+03 -0.299 0.764755
## windspeed:hum:seasonWeekend7:tempratAbs -4.181e+04 2.012e+04 -2.077 0.038168
## windspeed:hum:seasonWeekend8:tempratAbs 2.562e+04 1.394e+04 1.838 0.066462
## windspeed:hum:seasonWeekend9:tempratAbs 1.419e+03 6.180e+03 0.230 0.818473
## windspeed:hum:seasonWeekend10:tempratAbs -2.043e+03 5.774e+03 -0.354 0.723648
##
## (Intercept)
## mnth2
## mnth3 ***
## mnth4
## mnth5
## mnth6 ***
## mnth7 ***
## mnth8
## mnth9 ***
## mnth10
## mnth11 *
## mnth12
## yr:weekday0 **
## yr:weekday1 .
## yr:weekday2 *
## yr:weekday3 *
## yr:weekday4 ***
## yr:weekday5 ***
## yr:weekday6 ***
## mnthWeekend2:temp ***
## mnthWeekend3:temp
## mnthWeekend4:temp ***
## mnthWeekend5:temp .
## mnthWeekend6:temp ***
## mnthWeekend7:temp ***
## mnthWeekend8:temp ***
## mnthWeekend9:temp ***
## mnthWeekend10:temp ***
## mnthWeekend11:temp
## mnthWeekend12:temp
## mnthWeekend13:temp **
## mnthWeekend14:temp ***
## mnthWeekend15:temp ***
## mnthWeekend16:temp
## mnthWeekend17:temp
## mnthWeekend18:temp **
## mnthWeekend19:temp
## mnthWeekend20:temp ***
## mnthWeekend21:temp ***
## mnthWeekend22:temp ***
## mnthWeekend23:temp ***
## mnthWeekend24:temp ***
## mnthWeekend25:temp .
## previousCasual:weathersit1 **
## previousCasual:weathersit2 .
## previousCasual:weathersit3
## holiday:holidayseasons0 ***
## holiday:holidayseasons4 ***
## holiday:holidayseasons7 ***
## holiday:holidayseasons10 ***
## holiday:holidayseasons11 ***
## yr:mnthAvgCasual:slowseason0 *
## yr:mnthAvgCasual:slowseason1 **
## weekday0:season1:windspeed ***
## weekday1:season1:windspeed
## weekday2:season1:windspeed
## weekday3:season1:windspeed
## weekday4:season1:windspeed *
## weekday5:season1:windspeed *
## weekday6:season1:windspeed ***
## weekday0:season2:windspeed
## weekday1:season2:windspeed
## weekday2:season2:windspeed **
## weekday3:season2:windspeed *
## weekday4:season2:windspeed
## weekday5:season2:windspeed
## weekday6:season2:windspeed
## weekday0:season3:windspeed *
## weekday1:season3:windspeed *
## weekday2:season3:windspeed *
## weekday3:season3:windspeed *
## weekday4:season3:windspeed *
## weekday5:season3:windspeed
## weekday6:season3:windspeed
## weekday0:season4:windspeed ***
## weekday1:season4:windspeed
## weekday2:season4:windspeed .
## weekday3:season4:windspeed *
## weekday4:season4:windspeed .
## weekday5:season4:windspeed
## weekday6:season4:windspeed **
## weathersit1:previousweekCasual:hum ***
## weathersit2:previousweekCasual:hum ***
## weathersit3:previousweekCasual:hum ***
## windspeed:hum:seasonWeekend3:tempratAbs **
## windspeed:hum:seasonWeekend4:tempratAbs
## windspeed:hum:seasonWeekend5:tempratAbs *
## windspeed:hum:seasonWeekend6:tempratAbs
## windspeed:hum:seasonWeekend7:tempratAbs *
## windspeed:hum:seasonWeekend8:tempratAbs .
## windspeed:hum:seasonWeekend9:tempratAbs
## windspeed:hum:seasonWeekend10:tempratAbs
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 47040.88)
##
## Null deviance: 344158822 on 730 degrees of freedom
## Residual deviance: 30059120 on 639 degrees of freedom
## AIC: 10027
##
## Number of Fisher Scoring iterations: 2
vif(glm.1)
## GVIF Df GVIF^(1/(2*Df))
## mnth 1.937211e+19 11 7.528178
## yr:weekday 4.691561e+02 7 1.551702
## mnthWeekend:temp 2.043182e+22 24 2.916071
## previousCasual:weathersit 4.798553e+01 3 1.906273
## holiday:holidayseasons 1.206832e+01 5 1.282817
## yr:mnthAvgCasual:slowseason 1.848836e+01 2 2.073598
## weekday:season:windspeed 1.124336e+07 28 1.336315
## weathersit:previousweekCasual:hum 2.860791e+01 3 1.748830
## windspeed:hum:seasonWeekend:tempratAbs 4.682384e+02 8 1.468602
set.seed(1)
cv.error = cv.glm(day, glm.1)$delta[1]
cv.error
## [1] 59606.02
ResidualMagnitudeSquares = ( (abs(glm.1$residuals) + day$casual) / day$casual) ^ 2
# Residual Magnitude Sum of Squares
RMSS = sum( ResidualMagnitudeSquares)
RMSS
## [1] 1773.221
plot(1:731,ResidualMagnitudeSquares)
totalpredictednew = glm.1$fitted.values
totalpredicted1=0
totalpredicted2=0
mm=2
formula.predictive = (casual ~ yr:weekday+ yr:mnthAvgCasual +season:weekday + mnth + mnth:weekend:atemp + hum:weekend + windspeed:weekend+ previousCasual:weathersit + previousweekCasual:weathersit:hum + holiday + season:tempratAbs)
# 5x2 cross validation, this is explained in detail later under the boosted trees section
for (m in 1:mm) {
set.seed(m)
folds = sample(1:5, nrow(day), replace = TRUE)
for (j in 1:5) {
set.seed(1)
glm.2 = glm(formula.predictive, data=day[folds!=j,])
predicted = predict(glm.2, newdata = day[folds == j,])
if (m == 1) {
#first loop, cbind adds an index which we will merge on later, rbind combines all 731 predictions into one object
day.predicted = cbind(day$instant[folds == j], '3' = predicted)
totalpredicted1 = rbind(totalpredicted1, day.predicted)
}
else{
#second loop
day.predicted = cbind(day$instant[folds == j], '4' = predicted)
totalpredicted2 = rbind(totalpredicted2, day.predicted)
}
}
}
totalpredictednew = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredictednew$average = rowMeans(totalpredictednew[, 2:3],)
#(((totalpredictednew$average - day$casual)*100/day$casual)^2)
glmresiduals = totalpredictednew$average - day$casual
ResidualMagnitudeSquares = ((abs(glmresiduals) + day$casual) / day$casual) ^2
RMSS = sum( ResidualMagnitudeSquares)
RMSS
plot(1:731,ResidualMagnitudeSquares)
Diagnostic plots, here I take the outliers, the elements with the largest residuals, and the elements with the largest leverage and try to determine what features I could add/modify in order to improve the model fit
d=302
#diag.cols = c(3:5, 7:10, 12:13, 17:20,23, 25, 26:32 )
diag.cols = c('week', 'seasonWeekend', 'atemp', 'previousweekCasual')
par(mfrow=c(2,2), mar=c(4,4,4,4),oma = c(0, 0, 2, 2))
for(x in diag.cols)
{
plot(day[,x],day[,14], pch = 18,col=day.cols ,xlab = colnames(day)[x], ylab='Casual Users')
if(x=='seasonWeekend'){
legend(x = "topright",
inset = c(0, -.25), # You will need to fine-tune the first
# value depending on the windows size
legend = c("448"),
pch=17,
col = 2,
xpd = TRUE)
}
points(day[d,x], day[d,14],pch=17, col=2)
}
Loading the best value from previous tests
library(gbm)
## Loaded gbm 2.1.8
load('bikepercentage1.RData')
The following code performances stepwise selection and parameter selection through cross validation. First I’ll run the model from the last time I did stepwise selection for my gbm to get a baseline.
interactions = 3
flag = FALSE
#folds
k = 5
#number of times we preform the 5 fold cv
mm = 2
# when you get to the point that ''##begin 5fold cv x 2'' shows up, this is the code that runs. We run it here to get a baseline error. Code comments for this cv will be listed here, and left out in subsequent iterations
#these variables store all of the predictions our models make. Doing 5 fold cv, you predict on every observation. Since we are doing 5 fold X 2, we will predict on every observation twice. So we will take all of our predictions, and for each observation we will average our predictions, then calculate RSS as normal.
totalpredicted1 = 0
totalpredicted2 = 0
# outer loop iterates twice
for (m in 1:mm) {
set.seed(m)
# calculate new folds for each outer loop. This will help average out potential over/under fitting because our model will use 2 randomized training sets to predict on each observation.
folds = sample(1:k, nrow(day1), replace = TRUE)
#standard 5 fold.
for (j in 1:k) {
set.seed(1)
day1.boost = gbm(
casual ~ .,
data = day1[folds != j,],
n.trees = besttree,
interaction.depth = interactions,
shrinkage = bestshrink,
n.cores = 4
)
predicted = predict(day1.boost, newdata = day1[folds == j,], n.trees = besttree)
if (m == 1) {
#first loop, cbind adds an index which we will merge on later, rbind combines all 731 predictions into one object
day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
totalpredicted1 = rbind(totalpredicted1, day1.predicted)
}
else{
#second loop
day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
totalpredicted2 = rbind(totalpredicted2, day1.predicted)
}
}
}
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
## Distribution not specified, assuming gaussian ...
#merge all of our predictions, then average our predictions for each instance, then use the averaged prediction to calculate the RSS
totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredicted$average = rowMeans(totalpredicted[, 2:3])
par(mfrow=c(1,1), mar=c(4,4,4,4),oma = c(1,1,1,1))
newresiduals = ((totalpredicted$average + totalpredictednew)/2) - day1$casual
plot(1:731, newresiduals, xlab = 'instance', ylab = 'residual')
baseline = sum(((abs(newresiduals) + day1$casual)/day1$casual)^2)
baselineerror = mean((newresiduals ) ^ 2)
Begin Stepwise selection and parameter tuning
#this outerloop alternates between stepwise selection and parameter tuning
for (z in 1:5) {
#This is the stepwise selection loop, first we check if adding a feature improves performance, then we check if removing a feature improves performance
# currently checking if we need to add/remove features 10 times before moving on to parameter tuning
flag=FALSE
for (i in 1:5) {
print(i)
print(names(day1))
print(baseline)
#day1.baseline is our checkpoint, once we add/remove a feature to the model we use day1.baseline to reset to before the add/remove
day1.baseline = day1
day1.boost.cv.add = 1:length(withheld)
#This variable is used to see if we actually added any features for each loop, if we dont add/remove features for one loop, then we will exit the loop and go back to shrinkage/tree selection
old.names = names(day1)
# # # add features
print('adding')
for (x in 1:length(withheld)) {
print(x)
day1 = day1.baseline
day1 = cbind(day1, daycopy[which(names(daycopy) == withheld[x])])
##begin 5 fold cv X 2
totalpredicted1 = 0
totalpredicted2 = 0
for (m in 1:mm) {
set.seed(m)
folds = sample(1:k, nrow(day1), replace = TRUE)
for (j in 1:k) {
set.seed(1)
day1.boost = gbm(
casual ~ .,
data = day1[folds != j,],
n.trees = besttree,
interaction.depth = interactions,
shrinkage = bestshrink,
n.cores = 4,
distribution = 'gaussian'
)
predicted = predict(day1.boost,
newdata = day1[folds == j,],
n.trees = besttree)
if (m == 2) {
day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
totalpredicted2 = rbind(totalpredicted2, day1.predicted)
}
else{
day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
totalpredicted1 = rbind(totalpredicted1, day1.predicted)
}
}
}
##end 5 fold cv
totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredicted$average = rowMeans(totalpredicted[, 2:3],)
newresiduals = ((totalpredicted$average + totalpredictednew) / 2) - day1$casual
#report the cv error for the x-th feature
#day1.boost.cv.add[x] = mean((totalpredicted$average - day1$casual) ^ 2)
#day1.boost.cv.add[x] = mean((abs(((totalpredicted$average + totalpredictednew$average)/2) - day1$casual)/day1$casual))
day1.boost.cv.add[x] = sum(((abs(newresiduals) + day1$casual) / day1$casual) ^
2)
#day1.boost.cv.add[x] = mean((totalpredicted$average - day1$casual + glmresiduals) ^ 2)
}
#check if any of the x features that are currently withheld make the model better, if they do, add them to the model
day1 = day1.baseline
if (min(day1.boost.cv.add) < baseline) {
print('update')
plot(1:731, residuals)
#update our new best error
baseline = min(day1.boost.cv.add)
#index for the best addition
best = which.min(day1.boost.cv.add)
print(withheld[best])
#add it back to the model with cbind
readd = which(names(daycopy) == withheld[best])
day1 = cbind(day1, daycopy[readd])
#update our withheld list
withheld = withheld[-best]
}
#update our checkpoint
day1.baseline = day1
day1.boost.cv.remove = 1:length(names(day1))
# # # remove features
print('removing')
for (x in 2:length(names(day1))) {
print(x)
day1 = day1.baseline[-c(x)]
##begin 5 fold cv X 2
totalpredicted1 = 0
totalpredicted2 = 0
for (m in 1:mm) {
set.seed(m)
folds = sample(1:k, nrow(day1), replace = TRUE)
for (j in 1:k) {
set.seed(1)
day1.boost = gbm(
casual ~ .,
data = day1[folds != j,],
n.trees = besttree,
interaction.depth = interactions,
shrinkage = bestshrink,
n.cores = 4,
distribution = 'gaussian'
)
predicted = predict(day1.boost,
newdata = day1[folds == j,],
n.trees = besttree)
if (m == 2) {
day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
totalpredicted2 = rbind(totalpredicted2, day1.predicted)
}
else{
day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
totalpredicted1 = rbind(totalpredicted1, day1.predicted)
}
}
}
##end 5 fold cv
totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredicted$average = rowMeans(totalpredicted[, 2:3],)
newresiduals = ((totalpredicted$average + totalpredictednew) / 2) - day1$casual
#report the cv error for the x-th feature
day1.boost.cv.remove[x] = sum(((abs(newresiduals) + day1$casual) /
day1$casual) ^ 2)
}
#reset to checkpoint
day1 = day1.baseline
removederror = min(day1.boost.cv.remove[-1])
#check if removing makes things better
if (removederror < baseline) {
print('update')
plot(1:731, residuals)
worst = which.min(day1.boost.cv.remove[-1]) + 1
day1 = day1[-worst]
day1.removed = names(day1.baseline[worst])
print(day1.removed)
withheld = c(day1.removed, withheld)
baseline = removederror
}
#checking to see if anything was added/removed
if (identical(old.names, names(day1))) {
#if nothing was added on the very first iteration break out of the stepwise selection
if (i == 1) {
flag = TRUE
}
break
}
}
# this sets the ideal shrinkage and tree value by iterating over both.
trees = floor(seq(1000, 7000, length.out = 8))
shrinks = c(seq(.001, .05, length.out = 10))
#checkpoint the best tree value to see if we changed it at all, if we haven't changed the tree value, and we haven't added/removed, break out of the loop and report the predictors used for the best model.
old.tree = besttree
min.loop.error = 0
for (y in 1:length(trees)) {
print(y)
for (x in 1:length(shrinks)) {
##begin 5 fold cv X 2
totalpredicted1 = 0
totalpredicted2 = 0
for (m in 1:mm) {
set.seed(m)
folds = sample(1:k, nrow(day1), replace = TRUE)
for (j in 1:k) {
set.seed(1)
day1.boost = gbm(
casual ~ .,
data = day1[folds != j, ],
n.trees = trees[y],
interaction.depth = interactions,
shrinkage = shrinks[x],
n.cores = 4,
distribution = 'gaussian'
)
predicted = predict(day1.boost,
newdata = day1[folds == j,],
n.trees = trees[y])
if (m == 2) {
day1.predicted = cbind(daycopy[folds == j, 1], '2' = predicted)
totalpredicted2 = rbind(totalpredicted2, day1.predicted)
}
else{
day1.predicted = cbind(daycopy[folds == j, 1], '1' = predicted)
totalpredicted1 = rbind(totalpredicted1, day1.predicted)
}
}
}
##end 5 fold cv
totalpredicted = merge(totalpredicted1, totalpredicted2, by = 1)[-1, ]
totalpredicted$average = rowMeans(totalpredicted[, 2:3],)
newresiduals = ((totalpredicted$average + totalpredictednew) / 2) - day1$casual
#set the error for the current loop
loop.error = sum(((abs(newresiduals) + day1$casual) / day1$casual) ^
2)
#record the minimum error for the current loop
if (x == 1) {
min.loop.error = loop.error
} else if (loop.error < min.loop.error) {
min.loop.error = loop.error
}
#print for diagnostic
print(loop.error)
#check if the loop error is better than the current best parameters, if it is use the current parameters as the best
if (loop.error < baseline) {
print('update')
plot(1:731, residuals)
baseline = loop.error
besttree = trees[y]
bestshrink = shrinks[x]
}
#At first the loop will decresase in error each time since the shrinkage parameter is increasing, but at a certain point the loop will increase in error again, once this happens we can break the loop to speed up the overall process
if (loop.error / min.loop.error > 1.01) {
break
}
}
}
if (old.tree == besttree & flag == TRUE) {
break
}
}
names(day1)
averagedpredicted = rowMeans(cbind(totalpredicted$average,totalpredictednew))
bestresiduals = averagedpredicted - day1$casual
besterror = mean((averagedpredicted - day1$casual) ^ 2)
save(withheld, besttree, bestshrink, baseline, day1, file='bikepercentage1.RData')
# For 69% of days, the estimate of this model is within 25% of what was actually experienced.
length(which(abs((averagedpredicted - day$casual) / day$casual ) < .25))/731
## [1] 0.6935705
length(which(abs(bestresiduals)<200))/731
## [1] 0.7633379
with(dayplot, which(mnth %in% 3:9))
how many are within 25% of target
length(which(abs((averagedpredicted - day\(casual) / day\)casual ) > .25))/731